Compare commits

..

23 commits

Author SHA1 Message Date
Olivier Chafik
d7b31a9d84
sync: minja (a72057e519) (#11774) 2025-02-10 09:34:09 +00:00
pascal-lc
9ac3457b39
Update README.md [no ci] (#11781)
typo: `\` -> `/`
Change the path separator `\` to the UNIX `/`.
2025-02-10 09:05:57 +01:00
Danny Milosavljevic
c2a67efe38
vulkan: Make Vulkan optional at runtime (#11493). (#11494)
Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
2025-02-10 07:17:21 +01:00
Wagner Bruna
b044a0fe3c
vulkan: add environment variable GGML_VK_PREFER_HOST_MEMORY to avoid VRAM allocation (#11592) 2025-02-10 07:08:22 +01:00
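For reference, a minimal sketch of how such an opt-in environment variable is typically read on the backend side; the helper name and the exact boolean semantics are hypothetical, only `GGML_VK_PREFER_HOST_MEMORY` comes from the commit title:

```cpp
#include <cstdlib>
#include <cstring>

// Hypothetical helper (not the actual ggml-vulkan code): treat the variable as a
// boolean opt-in, so any value other than "0" prefers host memory over VRAM.
static bool vk_prefer_host_memory() {
    const char * env = std::getenv("GGML_VK_PREFER_HOST_MEMORY");
    return env != nullptr && std::strcmp(env, "0") != 0;
}
```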
Eric Curtin
19d3c8293b
There's a better way of clearing lines (#11756)
Use the ANSI escape code for clearing a line.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-02-09 10:34:49 +00:00
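The escape sequence involved appears later in this comparison as `LOG_CLR_TO_EOL` (`"\033[K\r"`). A minimal standalone sketch of the idea, clearing the current line before rewriting a status message:

```cpp
#include <cstdio>

int main() {
    std::printf("Loading model");
    std::fflush(stdout);
    // "\r" moves the cursor back to column 0 and "\033[K" (ANSI "erase in line")
    // clears from the cursor to the end of the line, so the next message fully
    // replaces the old one without padding the line with spaces.
    std::printf("\r\033[K");
    std::printf("Loading model... done\n");
    return 0;
}
```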
Jeff Bolz
98f6b0fd1e
vulkan: account for lookup tables when checking shared memory size (#11502) 2025-02-09 08:43:51 +01:00
Xuan-Son Nguyen
55ac8c7791
server : (webui) revamp Settings dialog, add Pyodide interpreter (#11759)
* redo Settings modal UI

* add python code interpreter

* fix auto scroll

* build

* fix overflow for long output lines

* bring back sticky copy button

* adapt layout on mobile view

* fix multiple lines output and color scheme

* handle python exception

* better state management

* add webworker

* add headers

* format code

* speed up by loading pyodide on page load

* (small tweak) add small animation to make it feel like Claude
2025-02-08 21:54:50 +01:00
Woof Dog
e6e6583199
server : (webui) increase edit textarea size (#11763) 2025-02-08 20:09:55 +01:00
Georgi Gerganov
aaa5505307
server : minor log updates (#11760)
ggml-ci
2025-02-08 18:08:43 +02:00
Georgi Gerganov
bdcf8b6a56
cont : fix mmap flag print (#11699) 2025-02-08 16:49:38 +02:00
Karol Kontny
4d3465c5ae
ggml: Fix data race in ggml threadpool (#11736)
After the barrier in the last iteration is executed, the loop termination
condition is still evaluated. By then the main thread may already have destroyed the
cgraph object and its nodes, so another thread ends up accessing memory that is gone.
Trouble can also happen when n_nodes == 0 or abort is called, though I'm not sure
whether the former situation is possible.

The last synchronization should be done after the loop to ensure the cgraph/cplan won't be
accessed after the main thread exits from the function.
2025-02-08 15:30:53 +01:00
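A minimal sketch of the pattern described above, using plain C++ threads rather than the real ggml threadpool (all names are illustrative): keeping the final barrier inside the loop lets a worker re-read shared state after the main thread is free to destroy it, so the last synchronization belongs after the loop.

```cpp
#include <barrier>
#include <vector>

struct graph_like { std::vector<int> nodes; };   // stand-in for cgraph/cplan

void worker(std::barrier<> & sync, const graph_like * graph, const bool * done) {
    while (!*done) {                  // termination condition reads shared state
        // ... process graph->nodes for this iteration ...
        sync.arrive_and_wait();       // per-iteration barrier
        // If this barrier were the last synchronization, the loop would still
        // re-read *done (and potentially *graph) afterwards, racing with the
        // main thread destroying the graph.
    }
    // Final synchronization after the loop: only once every worker reaches this
    // point may the main thread return and free the cgraph/cplan.
    sync.arrive_and_wait();
}
```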
Johannes Gäßler
d80be897ac
CUDA: fix min. version for movmatrix (#11751) 2025-02-08 10:46:07 +01:00
Nikolaos Pothitos
3ab410f55f
readme : update front-end framework (#11753)
After the migration to React with #11688
2025-02-08 10:43:04 +01:00
Xuan-Son Nguyen
0cf867160c
server : (webui) fix numeric settings being saved as string (#11739)
* server : (webui) fix numeric settings being saved as string

* add some more comments
2025-02-08 10:42:34 +01:00
Eric Curtin
d2fe216fb2
Make logging more verbose (#11714)
Debugged an issue with a user who was on a read-only filesystem.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-02-07 14:42:46 +00:00
Georgi Gerganov
ed926d8833
llama : fix defrag logic (#11707)
* llama : fix defrag logic

ggml-ci

* cont : better logic

ggml-ci

* cont : clamp fragmentation to 0.0

ggml-ci
2025-02-07 16:05:34 +02:00
Christian Fillion
2d219b389e
vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729)
Silently insert U+FFFD(s) (Unicode replacement character) instead until the
next valid codepoint can be found.

This fixes `llama_tokenize` throwing an exception across the C API boundary
or libllama's module boundary (the caller's runtime might be incompatible!)

Returning a proper error code might be desirable; however, the signature
of `llama_tokenize` doesn't allow it, as all return values already have
an existing meaning.
2025-02-07 15:55:47 +02:00
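A rough sketch of the substitution strategy (this is not libllama's actual decoder; it only illustrates replacing an invalid byte with U+FFFD and resynchronizing at the next byte):

```cpp
#include <string>

static std::string replace_invalid_utf8(const std::string & s) {
    std::string out;
    size_t i = 0;
    while (i < s.size()) {
        const unsigned char c = s[i];
        // expected sequence length from the lead byte (0 = invalid lead byte)
        const size_t len = c < 0x80 ? 1
                         : (c & 0xE0) == 0xC0 ? 2
                         : (c & 0xF0) == 0xE0 ? 3
                         : (c & 0xF8) == 0xF0 ? 4 : 0;
        bool ok = len != 0 && i + len <= s.size();
        for (size_t k = 1; ok && k < len; ++k) {
            ok = (static_cast<unsigned char>(s[i + k]) & 0xC0) == 0x80; // continuation byte
        }
        if (ok) {
            out.append(s, i, len);     // keep the valid sequence as-is
            i += len;
        } else {
            out += "\xEF\xBF\xBD";     // U+FFFD replacement character (UTF-8 encoded)
            i += 1;                    // resync at the next byte
        }
    }
    return out;
}
```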
magicse
333820d749
llama : fix progress dots (#11730)
* Update llama.cpp

Display progress dots in the terminal.
Without this, the progress dots were not shown while loading the model from a file.

* Update llama.cpp

removed trailing spaces
2025-02-07 15:48:47 +02:00
Jeff Bolz
c026ba3c23
vulkan: print shared memory size (#11719) 2025-02-07 11:26:03 +01:00
Christian Fillion
7ee953a64a
llama : add llama_sampler_init for safe usage of llama_sampler_free (#11727)
The C API in llama.h claims users can implement `llama_sampler_i` to
create custom `llama_sampler`. The sampler chain takes ownership and
calls `llama_sampler_free` on them. However, `llama_sampler_free` is
hard-coded to use `delete`. This is undefined behavior if the object
wasn't also allocated via `new` from libllama's C++ runtime. Callers
in C and C-compatible languages do not use C++'s `new` operator. C++
callers may not be sharing the same heap as libllama.
2025-02-07 11:33:27 +02:00
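From a custom-sampler author's point of view the change looks roughly as follows; this is a hedged sketch, not the real llama.h declarations, and the `llama_sampler_init(iface, ctx)` shape simply mirrors the call shown later in this comparison's diff:

```cpp
// Sketch only: forward declarations standing in for llama.h.
struct llama_sampler;
struct llama_sampler_i;   // table of user-implemented callbacks

extern "C" llama_sampler * llama_sampler_init(const llama_sampler_i * iface, void * ctx);

// Instead of allocating the struct in the caller (whose memory llama_sampler_free
// would later `delete`, possibly against a different heap or C++ runtime), let
// libllama allocate the object itself, so ownership handed to a sampler chain can
// be released symmetrically with llama_sampler_free.
llama_sampler * make_custom_sampler(const llama_sampler_i * iface, void * ctx) {
    return llama_sampler_init(iface, ctx);
}
```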
Akarshan Biswas
ec3bc8270b
SYCL: remove XMX info from print devices (#11712) 2025-02-07 09:27:53 +00:00
Daniel Bevenius
b7552cfcbc
common : add default embeddings presets (#11677)
* common : add default embeddings presets

This commit adds default embeddings presets for the following models:
- bge-small-en-v1.5
- e5-small-v2
- gte-small

These can be used with llama-embedding and llama-server.

For example, with llama-embedding:
```console
./build/bin/llama-embedding --embd-gte-small-default -p "Hello, how are you?"
```

And with llama-server:
```console
./build/bin/llama-server --embd-gte-small-default
```
And the embeddings endpoint can then be called with a POST request:
```console
curl --request POST \
    --url http://localhost:8080/embeddings \
    --header "Content-Type: application/json" \
    --data '{"input": "Hello, how are you?"}'
```

I'm not sure if these are the most common embedding models but hopefully
this can be a good starting point for discussion and further
improvements.

Refs: https://github.com/ggerganov/llama.cpp/issues/10932
2025-02-07 09:15:22 +01:00
Jinyang He
225bbbfa39
ggml : optimize and build warning fix for LoongArch (#11709)
* ggml : optimize convert f32<->f16 for loongarch_asx

* ggml : optimize loongarch_asx extend i16,i8,u8 to i32,i16

* ggml : Fix warnings when running CPU CI locally on LoongArch
2025-02-07 09:38:31 +02:00
38 changed files with 1470 additions and 796 deletions

View file

@ -2324,5 +2324,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_TTS}));
add_opt(common_arg(
{"--embd-bge-small-en-default"},
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-e5-small-en-default"},
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
params.hf_file = "e5-small-v2-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--embd-gte-small-default"},
string_format("use default gte-small model (note: can download weights from the internet)"),
[](common_params & params) {
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
params.hf_file = "gte-small-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
params.verbose_prompt = true;
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
return ctx_arg;
}

View file

@ -249,16 +249,30 @@ class chat_template {
inputs.add_generation_prompt = false;
full = apply(inputs);
}
if (full.find(prefix) != 0) {
if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
prefix = prefix.substr(0, prefix.size() - eos_token_.size());
auto eos_pos_last = full.rfind(eos_token_);
if (eos_pos_last == prefix.size() - eos_token_.size() ||
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
full = full.substr(0, eos_pos_last);
}
size_t common_prefix_length = 0;
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
if (prefix[i] != full[i]) {
break;
}
if (prefix[i] == '<') {
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
// but it removes thinking tags for past messages.
// The prefix and full strings diverge at <think> vs. <tool▁calls▁begin>, we avoid consuming the leading <.
continue;
}
common_prefix_length = i + 1;
}
if (full.find(prefix) != 0) {
auto example = full.substr(common_prefix_length);
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
} else {
tool_call_example_ = example;
}
tool_call_example_ = full.substr(prefix.size());
}
} catch (const std::exception & e) {
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@ -363,7 +377,7 @@ class chat_template {
if (polyfill_tools) {
adjusted_messages = add_system(inputs.messages,
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
} else {
adjusted_messages = inputs.messages;
}

View file

@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
};
}
return new llama_sampler{
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx,
};
/* .ctx = */ ctx
);
}
#else

View file

@ -2,6 +2,7 @@
#include "ggml.h" // for ggml_log_level
#define LOG_CLR_TO_EOL "\033[K\r"
#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD "\033[1m"
#define LOG_COL_RED "\033[31m"

View file

@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
return s.substr(start, end - start + 1);
}
static std::string capitalize(const std::string & s) {
if (s.empty()) return s;
auto result = s;
result[0] = std::toupper(result[0]);
return result;
}
static std::string html_escape(const std::string & s) {
std::string result;
result.reserve(s.size());
@ -1462,6 +1469,9 @@ public:
if (method->get_name() == "strip") {
vargs.expectArgs("strip method", {0, 0}, {0, 0});
return Value(strip(str));
} else if (method->get_name() == "capitalize") {
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
return Value(capitalize(str));
} else if (method->get_name() == "endswith") {
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
auto suffix = vargs.args[0].get<std::string>();
@ -1792,7 +1802,7 @@ private:
auto left = parseStringConcat();
if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
static std::regex not_tok(R"(not\b)");
std::string op_str;
while (!(op_str = consumeToken(compare_tok)).empty()) {
@ -2171,7 +2181,7 @@ private:
using TemplateTokenIterator = TemplateTokenVector::const_iterator;
std::vector<std::string> parseVarNames() {
static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
std::vector<std::string> group;
if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@ -2194,13 +2204,13 @@ private:
}
TemplateTokenVector tokenize() {
static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
static std::regex expr_open_regex(R"(\{\{([-~])?)");
static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
static std::regex block_close_regex(R"(\s*([-~])?%\})");
TemplateTokenVector tokens;
std::vector<std::string> group;
@ -2284,7 +2294,7 @@ private:
auto post_space = parseBlockClose();
tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
} else if (keyword == "set") {
static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
std::string ns;
std::vector<std::string> var_names;
@ -2336,6 +2346,11 @@ private:
throw std::runtime_error("Unexpected block: " + keyword);
}
} else if (std::regex_search(it, end, match, non_text_open_regex)) {
if (!match.position()) {
if (match[0] != "{#")
throw std::runtime_error("Internal error: Expected a comment");
throw std::runtime_error("Missing end of comment tag");
}
auto text_end = it + match.position();
text = std::string(it, text_end);
it = text_end;
@ -2400,7 +2415,7 @@ private:
auto text = text_token->text;
if (post_space == SpaceHandling::Strip) {
static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
static std::regex trailing_space_regex(R"(\s+$)");
text = std::regex_replace(text, trailing_space_regex, "");
} else if (options.lstrip_blocks && it != end) {
auto i = text.size();
@ -2410,7 +2425,7 @@ private:
}
}
if (pre_space == SpaceHandling::Strip) {
static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
static std::regex leading_space_regex(R"(^\s+)");
text = std::regex_replace(text, leading_space_regex, "");
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
if (text.length() > 0 && text[0] == '\n') {

View file

@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp.
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```bash
./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
```
### Windows:

View file

@ -346,7 +346,7 @@ class HttpClient {
if (!output_file.empty()) {
output_file_partial = output_file + ".partial";
if (!out.open(output_file_partial, "ab")) {
printe("Failed to open file\n");
printe("Failed to open file for writing\n");
return 1;
}
@ -535,8 +535,7 @@ class HttpClient {
static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
const std::string & progress_suffix) {
printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
progress_suffix.c_str());
printe("\r" LOG_CLR_TO_EOL "%s%s| %s", progress_prefix.c_str(), progress_bar.c_str(), progress_suffix.c_str());
}
// Function to write data to a file
static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
@ -797,16 +796,13 @@ class LlamaData {
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
resolve_model(opt.model_);
printe(
"\r%*s"
"\rLoading model",
get_terminal_width(), " ");
printe("\r" LOG_CLR_TO_EOL "Loading model");
llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
if (!model) {
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
}
printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
printe("\r" LOG_CLR_TO_EOL);
return model;
}
@ -969,10 +965,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
static int read_user_input(std::string & user_input) {
static const char * prompt_prefix = "> ";
#ifdef WIN32
printf(
"\r%*s"
"\r" LOG_COL_DEFAULT "%s",
get_terminal_width(), " ", prompt_prefix);
printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);
std::getline(std::cin, user_input);
if (std::cin.eof()) {

View file

@ -220,7 +220,7 @@ services:
The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
The web UI is developed using:
- `vue` framework for frontend development
- `react` framework for frontend development
- `tailwindcss` and `daisyui` for styling
- `vite` for build tooling

Binary file not shown.

View file

@ -334,24 +334,24 @@ struct server_task {
if (data.contains("json_schema") && !data.contains("grammar")) {
try {
auto schema = json_value(data, "json_schema", json::object());
LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
params.sampling.grammar = json_schema_to_grammar(schema);
LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
} catch (const std::exception & e) {
throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
}
} else {
params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
}
{
auto it = data.find("chat_format");
if (it != data.end()) {
params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
} else {
params.oaicompat_chat_format = defaults.oaicompat_chat_format;
}
@ -367,12 +367,12 @@ struct server_task {
auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
params.sampling.grammar_trigger_tokens.push_back(ids[0]);
params.sampling.preserved_tokens.insert(ids[0]);
continue;
}
LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
params.sampling.grammar_trigger_words.push_back(trigger);
}
}
@ -381,11 +381,11 @@ struct server_task {
for (const auto & t : *preserved_tokens) {
auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
LOG_DBG("Preserved token: %d\n", ids[0]);
SRV_DBG("Preserved token: %d\n", ids[0]);
params.sampling.preserved_tokens.insert(ids[0]);
} else {
// This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
}
}
}
@ -717,7 +717,7 @@ struct server_task_result_cmpl_final : server_task_result {
std::string finish_reason = "length";
common_chat_msg msg;
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
LOG_DBG("Parsing chat message: %s\n", content.c_str());
SRV_DBG("Parsing chat message: %s\n", content.c_str());
msg = common_chat_parse(content, oaicompat_chat_format);
finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
} else {
@ -1885,7 +1885,7 @@ struct server_context {
}
if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_from_model(model, "chatml");
} else {
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@ -3355,10 +3355,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
// reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
LOG_DBG("request: %s\n", req.body.c_str());
LOG_DBG("response: %s\n", res.body.c_str());
SRV_DBG("request: %s\n", req.body.c_str());
SRV_DBG("response: %s\n", res.body.c_str());
}
std::function<void(int)> shutdown_handler;
@ -3860,7 +3860,9 @@ int main(int argc, char ** argv) {
try {
const auto & prompt = data.at("prompt");
LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
// TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@ -4376,6 +4378,9 @@ int main(int argc, char ** argv) {
res.set_content("Error: gzip is not supported by this browser", "text/plain");
} else {
res.set_header("Content-Encoding", "gzip");
// COEP and COOP headers, required by pyodide (python interpreter)
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
}
return false;

View file

@ -8,6 +8,7 @@
"name": "webui",
"version": "0.0.0",
"dependencies": {
"@heroicons/react": "^2.2.0",
"@sec-ant/readable-stream": "^0.6.0",
"@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20",
@ -902,6 +903,15 @@
"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
}
},
"node_modules/@heroicons/react": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/@heroicons/react/-/react-2.2.0.tgz",
"integrity": "sha512-LMcepvRaS9LYHJGsF0zzmgKCUim/X3N/DQKc4jepAXJ7l8QxJ1PmxJzqplF2Z3FE4PqBAIGyJAQ/w4B5dsqbtQ==",
"license": "MIT",
"peerDependencies": {
"react": ">= 16 || ^19.0.0-rc"
}
},
"node_modules/@humanfs/core": {
"version": "0.19.1",
"resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",

View file

@ -11,6 +11,7 @@
"preview": "vite preview"
},
"dependencies": {
"@heroicons/react": "^2.2.0",
"@sec-ant/readable-stream": "^0.6.0",
"@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20",

View file

@ -1,8 +1,9 @@
import { HashRouter, Outlet, Route, Routes } from 'react-router';
import Header from './components/Header';
import Sidebar from './components/Sidebar';
import { AppContextProvider } from './utils/app.context';
import { AppContextProvider, useAppContext } from './utils/app.context';
import ChatScreen from './components/ChatScreen';
import SettingDialog from './components/SettingDialog';
function App() {
return (
@ -22,13 +23,23 @@ function App() {
}
function AppLayout() {
const { showSettings, setShowSettings } = useAppContext();
return (
<>
<Sidebar />
<div className="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
<div
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
id="main-scroll"
>
<Header />
<Outlet />
</div>
{
<SettingDialog
show={showSettings}
onClose={() => setShowSettings(false)}
/>
}
</>
);
}

View file

@ -10,6 +10,7 @@ export const BASE_URL = new URL('.', document.baseURI).href
export const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
@ -36,6 +37,8 @@ export const CONFIG_DEFAULT = {
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
// experimental features
pyIntepreterEnabled: false,
};
export const CONFIG_INFO: Record<string, string> = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',

View file

@ -0,0 +1,195 @@
import { useEffect, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import { OpenInNewTab, XCloseButton } from '../utils/common';
import { CanvasType } from '../utils/types';
import { PlayIcon, StopIcon } from '@heroicons/react/24/outline';
import StorageUtils from '../utils/storage';
const canInterrupt = typeof SharedArrayBuffer === 'function';
// adapted from https://pyodide.org/en/stable/usage/webworker.html
const WORKER_CODE = `
importScripts("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.js");
let stdOutAndErr = [];
let pyodideReadyPromise = loadPyodide({
stdout: (data) => stdOutAndErr.push(data),
stderr: (data) => stdOutAndErr.push(data),
});
let alreadySetBuff = false;
self.onmessage = async (event) => {
stdOutAndErr = [];
// make sure loading is done
const pyodide = await pyodideReadyPromise;
const { id, python, context, interruptBuffer } = event.data;
if (interruptBuffer && !alreadySetBuff) {
pyodide.setInterruptBuffer(interruptBuffer);
alreadySetBuff = true;
}
// Now load any packages we need, run the code, and send the result back.
await pyodide.loadPackagesFromImports(python);
// make a Python dictionary with the data from content
const dict = pyodide.globals.get("dict");
const globals = dict(Object.entries(context));
try {
self.postMessage({ id, running: true });
// Execute the python code in this context
const result = pyodide.runPython(python, { globals });
self.postMessage({ result, id, stdOutAndErr });
} catch (error) {
self.postMessage({ error: error.message, id });
}
interruptBuffer[0] = 0;
};
`;
let worker: Worker;
const interruptBuffer = canInterrupt
? new Uint8Array(new SharedArrayBuffer(1))
: null;
const startWorker = () => {
if (!worker) {
worker = new Worker(
URL.createObjectURL(new Blob([WORKER_CODE], { type: 'text/javascript' }))
);
}
};
if (StorageUtils.getConfig().pyIntepreterEnabled) {
startWorker();
}
const runCodeInWorker = (
pyCode: string,
callbackRunning: () => void
): {
donePromise: Promise<string>;
interrupt: () => void;
} => {
startWorker();
const id = Math.random() * 1e8;
const context = {};
if (interruptBuffer) {
interruptBuffer[0] = 0;
}
const donePromise = new Promise<string>((resolve) => {
worker.onmessage = (event) => {
const { error, stdOutAndErr, running } = event.data;
if (id !== event.data.id) return;
if (running) {
callbackRunning();
return;
} else if (error) {
resolve(error.toString());
} else {
resolve(stdOutAndErr.join('\n'));
}
};
worker.postMessage({ id, python: pyCode, context, interruptBuffer });
});
const interrupt = () => {
console.log('Interrupting...');
console.trace();
if (interruptBuffer) {
interruptBuffer[0] = 2;
}
};
return { donePromise, interrupt };
};
export default function CanvasPyInterpreter() {
const { canvasData, setCanvasData } = useAppContext();
const [code, setCode] = useState(canvasData?.content ?? ''); // copy to avoid direct mutation
const [running, setRunning] = useState(false);
const [output, setOutput] = useState('');
const [interruptFn, setInterruptFn] = useState<() => void>();
const [showStopBtn, setShowStopBtn] = useState(false);
const runCode = async (pycode: string) => {
interruptFn?.();
setRunning(true);
setOutput('Loading Pyodide...');
const { donePromise, interrupt } = runCodeInWorker(pycode, () => {
setOutput('Running...');
setShowStopBtn(canInterrupt);
});
setInterruptFn(() => interrupt);
const out = await donePromise;
setOutput(out);
setRunning(false);
setShowStopBtn(false);
};
// run code on mount
useEffect(() => {
setCode(canvasData?.content ?? '');
runCode(canvasData?.content ?? '');
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [canvasData?.content]);
if (canvasData?.type !== CanvasType.PY_INTERPRETER) {
return null;
}
return (
<div className="card bg-base-200 w-full h-full shadow-xl">
<div className="card-body">
<div className="flex justify-between items-center mb-4">
<span className="text-lg font-bold">Python Interpreter</span>
<XCloseButton
className="bg-base-100"
onClick={() => setCanvasData(null)}
/>
</div>
<div className="grid grid-rows-3 gap-4 h-full">
<textarea
className="textarea textarea-bordered w-full h-full font-mono"
value={code}
onChange={(e) => setCode(e.target.value)}
></textarea>
<div className="font-mono flex flex-col row-span-2">
<div className="flex items-center mb-2">
<button
className="btn btn-sm bg-base-100"
onClick={() => runCode(code)}
disabled={running}
>
<PlayIcon className="h-6 w-6" /> Run
</button>
{showStopBtn && (
<button
className="btn btn-sm bg-base-100 ml-2"
onClick={() => interruptFn?.()}
>
<StopIcon className="h-6 w-6" /> Stop
</button>
)}
<span className="grow text-right text-xs">
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/11762">
Report a bug
</OpenInNewTab>
</span>
</div>
<textarea
className="textarea textarea-bordered h-full dark-color"
value={output}
readOnly
></textarea>
</div>
</div>
</div>
</div>
);
}

View file

@ -92,7 +92,7 @@ export default function ChatMessage({
<>
<textarea
dir="auto"
className="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
className="textarea textarea-bordered bg-base-100 text-base-content max-w-2xl w-[calc(90vw-8em)] h-24"
value={editingContent}
onChange={(e) => setEditingContent(e.target.value)}
></textarea>
@ -149,11 +149,17 @@ export default function ChatMessage({
)}
</summary>
<div className="collapse-content">
<MarkdownDisplay content={thought} />
<MarkdownDisplay
content={thought}
isGenerating={isPending}
/>
</div>
</details>
)}
<MarkdownDisplay content={content} />
<MarkdownDisplay
content={content}
isGenerating={isPending}
/>
</div>
</>
)}

View file

@ -1,9 +1,11 @@
import { useEffect, useRef, useState } from 'react';
import { useEffect, useState } from 'react';
import { useAppContext } from '../utils/app.context';
import StorageUtils from '../utils/storage';
import { useNavigate } from 'react-router';
import ChatMessage from './ChatMessage';
import { PendingMessage } from '../utils/types';
import { CanvasType, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import CanvasPyInterpreter from './CanvasPyInterpreter';
export default function ChatScreen() {
const {
@ -12,24 +14,24 @@ export default function ChatScreen() {
isGenerating,
stopGenerating,
pendingMessages,
canvasData,
} = useAppContext();
const [inputMsg, setInputMsg] = useState('');
const containerRef = useRef<HTMLDivElement>(null);
const navigate = useNavigate();
const currConvId = viewingConversation?.id ?? '';
const pendingMsg: PendingMessage | undefined = pendingMessages[currConvId];
const scrollToBottom = (requiresNearBottom: boolean) => {
if (!containerRef.current) return;
const msgListElem = containerRef.current;
const mainScrollElem = document.getElementById('main-scroll');
if (!mainScrollElem) return;
const spaceToBottom =
msgListElem.scrollHeight -
msgListElem.scrollTop -
msgListElem.clientHeight;
mainScrollElem.scrollHeight -
mainScrollElem.scrollTop -
mainScrollElem.clientHeight;
if (!requiresNearBottom || spaceToBottom < 50) {
setTimeout(
() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }),
() => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
1
);
}
@ -58,66 +60,87 @@ export default function ChatScreen() {
}
};
const hasCanvas = !!canvasData;
return (
<>
{/* chat messages */}
<div
className={classNames({
'grid lg:gap-8 grow transition-[300ms]': true,
'grid-cols-[1fr_0fr] lg:grid-cols-[1fr_1fr]': hasCanvas, // adapted for mobile
'grid-cols-[1fr_0fr]': !hasCanvas,
})}
>
<div
id="messages-list"
className="flex flex-col grow overflow-y-auto"
ref={containerRef}
className={classNames({
'flex flex-col w-full max-w-[900px] mx-auto': true,
'hidden lg:flex': hasCanvas, // adapted for mobile
flex: !hasCanvas,
})}
>
<div className="mt-auto flex justify-center">
{/* placeholder to shift the message to the bottom */}
{viewingConversation ? '' : 'Send a message to start'}
{/* chat messages */}
<div id="messages-list" className="grow">
<div className="mt-auto flex justify-center">
{/* placeholder to shift the message to the bottom */}
{viewingConversation ? '' : 'Send a message to start'}
</div>
{viewingConversation?.messages.map((msg) => (
<ChatMessage
key={msg.id}
msg={msg}
scrollToBottom={scrollToBottom}
/>
))}
{pendingMsg && (
<ChatMessage
msg={pendingMsg}
scrollToBottom={scrollToBottom}
isPending
id="pending-msg"
/>
)}
</div>
{viewingConversation?.messages.map((msg) => (
<ChatMessage key={msg.id} msg={msg} scrollToBottom={scrollToBottom} />
))}
{pendingMsg && (
<ChatMessage
msg={pendingMsg}
scrollToBottom={scrollToBottom}
isPending
id="pending-msg"
/>
{/* chat input */}
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
className="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
value={inputMsg}
onChange={(e) => setInputMsg(e.target.value)}
onKeyDown={(e) => {
if (e.key === 'Enter' && e.shiftKey) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
></textarea>
{isGenerating(currConvId) ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId)}
>
Stop
</button>
) : (
<button
className="btn btn-primary ml-2"
onClick={sendNewMessage}
disabled={inputMsg.trim().length === 0}
>
Send
</button>
)}
</div>
</div>
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
{canvasData?.type === CanvasType.PY_INTERPRETER && (
<CanvasPyInterpreter />
)}
</div>
{/* chat input */}
<div className="flex flex-row items-center mt-8 mb-6">
<textarea
className="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
value={inputMsg}
onChange={(e) => setInputMsg(e.target.value)}
onKeyDown={(e) => {
if (e.key === 'Enter' && e.shiftKey) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
></textarea>
{isGenerating(currConvId) ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId)}
>
Stop
</button>
) : (
<button
className="btn btn-primary ml-2"
onClick={sendNewMessage}
disabled={inputMsg.trim().length === 0}
>
Send
</button>
)}
</div>
</>
</div>
);
}

View file

@ -5,12 +5,11 @@ import { classNames } from '../utils/misc';
import daisyuiThemes from 'daisyui/src/theming/themes';
import { THEMES } from '../Config';
import { useNavigate } from 'react-router';
import SettingDialog from './SettingDialog';
export default function Header() {
const navigate = useNavigate();
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
const [showSettingDialog, setShowSettingDialog] = useState(false);
const { setShowSettings } = useAppContext();
const setTheme = (theme: string) => {
StorageUtils.setTheme(theme);
@ -54,7 +53,7 @@ export default function Header() {
};
return (
<div className="flex flex-row items-center mt-6 mb-6">
<div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
{/* open sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
@ -109,7 +108,7 @@ export default function Header() {
</ul>
</div>
<div className="tooltip tooltip-bottom" data-tip="Settings">
<button className="btn" onClick={() => setShowSettingDialog(true)}>
<button className="btn" onClick={() => setShowSettings(true)}>
{/* settings button */}
<svg
xmlns="http://www.w3.org/2000/svg"
@ -172,11 +171,6 @@ export default function Header() {
</div>
</div>
</div>
<SettingDialog
show={showSettingDialog}
onClose={() => setShowSettingDialog(false)}
/>
</div>
);
}

View file

@ -9,8 +9,16 @@ import 'katex/dist/katex.min.css';
import { classNames, copyStr } from '../utils/misc';
import { ElementContent, Root } from 'hast';
import { visit } from 'unist-util-visit';
import { useAppContext } from '../utils/app.context';
import { CanvasType } from '../utils/types';
export default function MarkdownDisplay({ content }: { content: string }) {
export default function MarkdownDisplay({
content,
isGenerating,
}: {
content: string;
isGenerating?: boolean;
}) {
const preprocessedContent = useMemo(
() => preprocessLaTeX(content),
[content]
@ -21,8 +29,13 @@ export default function MarkdownDisplay({ content }: { content: string }) {
rehypePlugins={[rehypeHightlight, rehypeKatex, rehypeCustomCopyButton]}
components={{
button: (props) => (
<CopyCodeButton {...props} origContent={preprocessedContent} />
<CodeBlockButtons
{...props}
isGenerating={isGenerating}
origContent={preprocessedContent}
/>
),
// note: do not use "pre", "p" or other basic html elements here, it will cause the node to re-render when the message is being generated (this should be a bug with react-markdown, not sure how to fix it)
}}
>
{preprocessedContent}
@ -30,11 +43,12 @@ export default function MarkdownDisplay({ content }: { content: string }) {
);
}
const CopyCodeButton: React.ElementType<
const CodeBlockButtons: React.ElementType<
React.ClassAttributes<HTMLButtonElement> &
React.HTMLAttributes<HTMLButtonElement> &
ExtraProps & { origContent: string }
> = ({ node, origContent }) => {
ExtraProps & { origContent: string; isGenerating?: boolean }
> = ({ node, origContent, isGenerating }) => {
const { config } = useAppContext();
const startOffset = node?.position?.start.offset ?? 0;
const endOffset = node?.position?.end.offset ?? 0;
@ -47,14 +61,33 @@ const CopyCodeButton: React.ElementType<
[origContent, startOffset, endOffset]
);
const codeLanguage = useMemo(
() =>
origContent
.substring(startOffset, startOffset + 10)
.match(/^```([^\n]+)\n/)?.[1] ?? '',
[origContent, startOffset]
);
const canRunCode =
!isGenerating &&
config.pyIntepreterEnabled &&
codeLanguage.startsWith('py');
return (
<div
className={classNames({
'text-right sticky top-4 mb-2 mr-2 h-0': true,
'text-right sticky top-[7em] mb-2 mr-2 h-0': true,
'display-none': !node?.position,
})}
>
<CopyButton className="badge btn-mini" content={copiedContent} />
{canRunCode && (
<RunPyCodeButton
className="badge btn-mini ml-2"
content={copiedContent}
/>
)}
</div>
);
};
@ -81,6 +114,31 @@ export const CopyButton = ({
);
};
export const RunPyCodeButton = ({
content,
className,
}: {
content: string;
className?: string;
}) => {
const { setCanvasData } = useAppContext();
return (
<>
<button
className={className}
onClick={() =>
setCanvasData({
type: CanvasType.PY_INTERPRETER,
content,
})
}
>
Run
</button>
</>
);
};
/**
* This injects the "button" element before each "pre" element.
* The actual button will be replaced with a react component in the MarkdownDisplay.
@ -94,9 +152,7 @@ function rehypeCustomCopyButton() {
// replace current node
preNode.properties.visited = 'true';
node.tagName = 'div';
node.properties = {
className: 'relative my-4',
};
node.properties = {};
// add node for button
const btnNode: ElementContent = {
type: 'element',

View file

@ -3,17 +3,27 @@ import { useAppContext } from '../utils/app.context';
import { CONFIG_DEFAULT, CONFIG_INFO } from '../Config';
import { isDev } from '../Config';
import StorageUtils from '../utils/storage';
import { classNames, isBoolean, isNumeric, isString } from '../utils/misc';
import {
BeakerIcon,
ChatBubbleOvalLeftEllipsisIcon,
Cog6ToothIcon,
FunnelIcon,
HandRaisedIcon,
SquaresPlusIcon,
} from '@heroicons/react/24/outline';
import { OpenInNewTab } from '../utils/common';
type SettKey = keyof typeof CONFIG_DEFAULT;
const COMMON_SAMPLER_KEYS: SettKey[] = [
const BASIC_KEYS: SettKey[] = [
'temperature',
'top_k',
'top_p',
'min_p',
'max_tokens',
];
const OTHER_SAMPLER_KEYS: SettKey[] = [
const SAMPLER_KEYS: SettKey[] = [
'dynatemp_range',
'dynatemp_exponent',
'typical_p',
@ -31,6 +41,223 @@ const PENALTY_KEYS: SettKey[] = [
'dry_penalty_last_n',
];
enum SettingInputType {
SHORT_INPUT,
LONG_INPUT,
CHECKBOX,
CUSTOM,
}
interface SettingFieldInput {
type: Exclude<SettingInputType, SettingInputType.CUSTOM>;
label: string | React.ReactElement;
help?: string | React.ReactElement;
key: SettKey;
}
interface SettingFieldCustom {
type: SettingInputType.CUSTOM;
key: SettKey;
component:
| string
| React.FC<{
value: string | boolean | number;
onChange: (value: string) => void;
}>;
}
interface SettingSection {
title: React.ReactElement;
fields: (SettingFieldInput | SettingFieldCustom)[];
}
const ICON_CLASSNAME = 'w-4 h-4 mr-1 inline';
const SETTING_SECTIONS: SettingSection[] = [
{
title: (
<>
<Cog6ToothIcon className={ICON_CLASSNAME} />
General
</>
),
fields: [
{
type: SettingInputType.SHORT_INPUT,
label: 'API Key',
key: 'apiKey',
},
{
type: SettingInputType.LONG_INPUT,
label: 'System Message (will be disabled if left empty)',
key: 'systemMessage',
},
...BASIC_KEYS.map(
(key) =>
({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
}) as SettingFieldInput
),
],
},
{
title: (
<>
<FunnelIcon className={ICON_CLASSNAME} />
Samplers
</>
),
fields: [
{
type: SettingInputType.SHORT_INPUT,
label: 'Samplers queue',
key: 'samplers',
},
...SAMPLER_KEYS.map(
(key) =>
({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
}) as SettingFieldInput
),
],
},
{
title: (
<>
<HandRaisedIcon className={ICON_CLASSNAME} />
Penalties
</>
),
fields: PENALTY_KEYS.map((key) => ({
type: SettingInputType.SHORT_INPUT,
label: key,
key,
})),
},
{
title: (
<>
<ChatBubbleOvalLeftEllipsisIcon className={ICON_CLASSNAME} />
Reasoning
</>
),
fields: [
{
type: SettingInputType.CHECKBOX,
label: 'Expand though process by default for generating message',
key: 'showThoughtInProgress',
},
{
type: SettingInputType.CHECKBOX,
label:
'Exclude thought process when sending request to API (Recommended for DeepSeek-R1)',
key: 'excludeThoughtOnReq',
},
],
},
{
title: (
<>
<SquaresPlusIcon className={ICON_CLASSNAME} />
Advanced
</>
),
fields: [
{
type: SettingInputType.CUSTOM,
key: 'custom', // dummy key, won't be used
component: () => {
const debugImportDemoConv = async () => {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
};
return (
<button className="btn" onClick={debugImportDemoConv}>
(debug) Import demo conversation
</button>
);
},
},
{
type: SettingInputType.CHECKBOX,
label: 'Show tokens per second',
key: 'showTokensPerSecond',
},
{
type: SettingInputType.LONG_INPUT,
label: (
<>
Custom JSON config (For more info, refer to{' '}
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md">
server documentation
</OpenInNewTab>
)
</>
),
key: 'custom',
},
],
},
{
title: (
<>
<BeakerIcon className={ICON_CLASSNAME} />
Experimental
</>
),
fields: [
{
type: SettingInputType.CUSTOM,
key: 'custom', // dummy key, won't be used
component: () => (
<>
<p className="mb-8">
Experimental features are not guaranteed to work correctly.
<br />
<br />
If you encounter any problems, create a{' '}
<OpenInNewTab href="https://github.com/ggerganov/llama.cpp/issues/new?template=019-bug-misc.yml">
Bug (misc.)
</OpenInNewTab>{' '}
report on Github. Please also specify <b>webui/experimental</b> on
the report title and include screenshots.
<br />
<br />
Some features may require packages downloaded from CDN, so they
need internet connection.
</p>
</>
),
},
{
type: SettingInputType.CHECKBOX,
label: (
<>
<b>Enable Python interpreter</b>
<br />
<small className="text-xs">
This feature uses{' '}
<OpenInNewTab href="https://pyodide.org">pyodide</OpenInNewTab>,
downloaded from CDN. To use this feature, ask the LLM to generate
python code inside a markdown code block. You will see a "Run"
button on the code block, near the "Copy" button.
</small>
</>
),
key: 'pyIntepreterEnabled',
},
],
},
];
export default function SettingDialog({
show,
onClose,
@ -39,6 +266,7 @@ export default function SettingDialog({
onClose: () => void;
}) {
const { config, saveConfig } = useAppContext();
const [sectionIdx, setSectionIdx] = useState(0);
// clone the config object to prevent direct mutation
const [localConfig, setLocalConfig] = useState<typeof CONFIG_DEFAULT>(
@ -52,206 +280,148 @@ export default function SettingDialog({
};
const handleSave = () => {
saveConfig(localConfig);
// copy the local config to prevent direct mutation
const newConfig: typeof CONFIG_DEFAULT = JSON.parse(
JSON.stringify(localConfig)
);
// validate the config
for (const key in newConfig) {
const value = newConfig[key as SettKey];
const mustBeBoolean = isBoolean(CONFIG_DEFAULT[key as SettKey]);
const mustBeString = isString(CONFIG_DEFAULT[key as SettKey]);
const mustBeNumeric = isNumeric(CONFIG_DEFAULT[key as SettKey]);
if (mustBeString) {
if (!isString(value)) {
alert(`Value for ${key} must be string`);
return;
}
} else if (mustBeNumeric) {
const trimedValue = value.toString().trim();
const numVal = Number(trimedValue);
if (isNaN(numVal) || !isNumeric(numVal) || trimedValue.length === 0) {
alert(`Value for ${key} must be numeric`);
return;
}
// force conversion to number
// @ts-expect-error this is safe
newConfig[key] = numVal;
} else if (mustBeBoolean) {
if (!isBoolean(value)) {
alert(`Value for ${key} must be boolean`);
return;
}
} else {
console.error(`Unknown default type for key ${key}`);
}
}
if (isDev) console.log('Saving config', newConfig);
saveConfig(newConfig);
onClose();
};
const debugImportDemoConv = async () => {
const res = await fetch('/demo-conversation.json');
const demoConv = await res.json();
StorageUtils.remove(demoConv.id);
for (const msg of demoConv.messages) {
StorageUtils.appendMsg(demoConv.id, msg);
}
onClose();
const onChange = (key: SettKey) => (value: string | boolean) => {
// note: we do not perform validation here, because we may get incomplete value as user is still typing it
setLocalConfig({ ...localConfig, [key]: value });
};
return (
<dialog className={`modal ${show ? 'modal-open' : ''}`}>
<div className="modal-box">
<dialog className={classNames({ modal: true, 'modal-open': show })}>
<div className="modal-box w-11/12 max-w-3xl">
<h3 className="text-lg font-bold mb-6">Settings</h3>
<div className="h-[calc(90vh-12rem)] overflow-y-auto">
<p className="opacity-40 mb-6">
Settings below are saved in browser's localStorage
</p>
<SettingsModalShortInput
configKey="apiKey"
configDefault={CONFIG_DEFAULT}
value={localConfig.apiKey}
onChange={(value) =>
setLocalConfig({ ...localConfig, apiKey: value })
}
/>
<label className="form-control mb-2">
<div className="label">
System Message (will be disabled if left empty)
</div>
<textarea
className="textarea textarea-bordered h-24"
placeholder={`Default: ${CONFIG_DEFAULT.systemMessage}`}
value={localConfig.systemMessage}
onChange={(e) =>
setLocalConfig({
...localConfig,
systemMessage: e.target.value,
})
}
/>
</label>
{COMMON_SAMPLER_KEYS.map((key) => (
<SettingsModalShortInput
key={key}
configKey={key}
configDefault={CONFIG_DEFAULT}
value={localConfig[key]}
onChange={(value) =>
setLocalConfig({ ...localConfig, [key]: value })
}
/>
))}
<details className="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary className="collapse-title font-bold">
Other sampler settings
</summary>
<div className="collapse-content">
<SettingsModalShortInput
label="Samplers queue"
configKey="samplers"
configDefault={CONFIG_DEFAULT}
value={localConfig.samplers}
onChange={(value) =>
setLocalConfig({ ...localConfig, samplers: value })
}
/>
{OTHER_SAMPLER_KEYS.map((key) => (
<SettingsModalShortInput
key={key}
configKey={key}
configDefault={CONFIG_DEFAULT}
value={localConfig[key]}
onChange={(value) =>
setLocalConfig({ ...localConfig, [key]: value })
}
/>
))}
</div>
</details>
<details className="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary className="collapse-title font-bold">
Penalties settings
</summary>
<div className="collapse-content">
{PENALTY_KEYS.map((key) => (
<SettingsModalShortInput
key={key}
configKey={key}
configDefault={CONFIG_DEFAULT}
value={localConfig[key]}
onChange={(value) =>
setLocalConfig({ ...localConfig, [key]: value })
}
/>
))}
</div>
</details>
<details className="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary className="collapse-title font-bold">
Reasoning models
</summary>
<div className="collapse-content">
<div className="flex flex-row items-center mb-2">
<input
type="checkbox"
className="checkbox"
checked={localConfig.showThoughtInProgress}
onChange={(e) =>
setLocalConfig({
...localConfig,
showThoughtInProgress: e.target.checked,
})
}
/>
<span className="ml-4">
Expand though process by default for generating message
</span>
<div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
{/* Left panel, showing sections - Desktop version */}
<div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
{SETTING_SECTIONS.map((section, idx) => (
<div
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal w-44 mb-1': true,
'btn-active': sectionIdx === idx,
})}
onClick={() => setSectionIdx(idx)}
dir="auto"
>
{section.title}
</div>
<div className="flex flex-row items-center mb-2">
<input
type="checkbox"
className="checkbox"
checked={localConfig.excludeThoughtOnReq}
onChange={(e) =>
setLocalConfig({
...localConfig,
excludeThoughtOnReq: e.target.checked,
})
}
/>
<span className="ml-4">
Exclude thought process when sending request to API
(Recommended for DeepSeek-R1)
</span>
</div>
</div>
</details>
))}
</div>
<details className="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary className="collapse-title font-bold">
Advanced config
</summary>
<div className="collapse-content">
{/* this button only shows in dev mode, used to import a demo conversation to test message rendering */}
{isDev && (
<div className="flex flex-row items-center mb-2">
<button className="btn" onClick={debugImportDemoConv}>
(debug) Import demo conversation
</button>
</div>
)}
<div className="flex flex-row items-center mb-2">
<input
type="checkbox"
className="checkbox"
checked={localConfig.showTokensPerSecond}
onChange={(e) =>
setLocalConfig({
...localConfig,
showTokensPerSecond: e.target.checked,
})
}
/>
<span className="ml-4">Show tokens per second</span>
</div>
<label className="form-control mb-2">
<div className="label inline">
Custom JSON config (For more info, refer to{' '}
<a
className="underline"
href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md"
target="_blank"
rel="noopener noreferrer"
{/* Left panel, showing sections - Mobile version */}
<div className="md:hidden flex flex-row gap-2 mb-4">
<details className="dropdown">
<summary className="btn bt-sm w-full m-1">
{SETTING_SECTIONS[sectionIdx].title}
</summary>
<ul className="menu dropdown-content bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
{SETTING_SECTIONS.map((section, idx) => (
<div
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal': true,
'btn-active': sectionIdx === idx,
})}
onClick={() => setSectionIdx(idx)}
dir="auto"
>
server documentation
</a>
)
</div>
<textarea
className="textarea textarea-bordered h-24"
placeholder='Example: { "mirostat": 1, "min_p": 0.1 }'
value={localConfig.custom}
onChange={(e) =>
setLocalConfig({ ...localConfig, custom: e.target.value })
}
/>
</label>
</div>
</details>
{section.title}
</div>
))}
</ul>
</details>
</div>
{/* Right panel, showing setting fields */}
<div className="grow overflow-y-auto px-4">
{SETTING_SECTIONS[sectionIdx].fields.map((field, idx) => {
const key = `${sectionIdx}-${idx}`;
if (field.type === SettingInputType.SHORT_INPUT) {
return (
<SettingsModalShortInput
key={key}
configKey={field.key}
value={localConfig[field.key]}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.LONG_INPUT) {
return (
<SettingsModalLongInput
key={key}
configKey={field.key}
value={localConfig[field.key].toString()}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.CHECKBOX) {
return (
<SettingsModalCheckbox
key={key}
configKey={field.key}
value={!!localConfig[field.key]}
onChange={onChange(field.key)}
label={field.label as string}
/>
);
} else if (field.type === SettingInputType.CUSTOM) {
return (
<div key={key} className="mb-2">
{typeof field.component === 'string'
? field.component
: field.component({
value: localConfig[field.key],
onChange: onChange(field.key),
})}
</div>
);
}
})}
<p className="opacity-40 mb-6 text-sm mt-8">
Settings are saved in browser's localStorage
</p>
</div>
</div>
<div className="modal-action">
@ -270,37 +440,97 @@ export default function SettingDialog({
);
}
function SettingsModalShortInput({
function SettingsModalLongInput({
configKey,
configDefault,
value,
onChange,
label,
}: {
configKey: SettKey;
configDefault: typeof CONFIG_DEFAULT;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
value: any;
value: string;
onChange: (value: string) => void;
label?: string;
}) {
return (
<label className="input input-bordered join-item grow flex items-center gap-2 mb-2">
<div className="dropdown dropdown-hover">
<div tabIndex={0} role="button" className="font-bold">
{label || configKey}
</div>
<div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{CONFIG_INFO[configKey] ?? '(no help message available)'}
</div>
</div>
<input
type="text"
className="grow"
placeholder={`Default: ${configDefault[configKey] || 'none'}`}
<label className="form-control mb-2">
<div className="label inline">{label || configKey}</div>
<textarea
className="textarea textarea-bordered h-24"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
/>
</label>
);
}
function SettingsModalShortInput({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
value: any;
onChange: (value: string) => void;
label?: string;
}) {
const helpMsg = CONFIG_INFO[configKey];
return (
<>
{/* on mobile, we simply show the help message here */}
{helpMsg && (
<div className="block md:hidden mb-1">
<b>{label || configKey}</b>
<br />
<p className="text-xs">{helpMsg}</p>
</div>
)}
<label className="input input-bordered join-item grow flex items-center gap-2 mb-2">
<div className="dropdown dropdown-hover">
<div tabIndex={0} role="button" className="font-bold hidden md:block">
{label || configKey}
</div>
{helpMsg && (
<div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{helpMsg}
</div>
)}
</div>
<input
type="text"
className="grow"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
/>
</label>
</>
);
}
function SettingsModalCheckbox({
configKey,
value,
onChange,
label,
}: {
configKey: SettKey;
value: boolean;
onChange: (value: boolean) => void;
label: string;
}) {
return (
<div className="flex flex-row items-center mb-2">
<input
type="checkbox"
className="toggle"
checked={value}
onChange={(e) => onChange(e.target.checked)}
/>
<span className="ml-4">{label || configKey}</span>
</div>
);
}

View file

@ -45,6 +45,9 @@
/* Highlight.js */
[data-color-scheme='light'] {
@include meta.load-css('highlight.js/styles/stackoverflow-light');
.dark-color {
@apply bg-base-content text-base-100;
}
}
[data-color-scheme='dark'] {
@include meta.load-css('highlight.js/styles/stackoverflow-dark');
@ -52,6 +55,9 @@
[data-color-scheme='auto'] {
@media (prefers-color-scheme: light) {
@include meta.load-css('highlight.js/styles/stackoverflow-light');
.dark-color {
@apply bg-base-content text-base-100;
}
}
@media (prefers-color-scheme: dark) {
@include meta.load-css('highlight.js/styles/stackoverflow-dark');

View file

@ -1,5 +1,11 @@
import React, { createContext, useContext, useEffect, useState } from 'react';
import { APIMessage, Conversation, Message, PendingMessage } from './types';
import {
APIMessage,
CanvasData,
Conversation,
Message,
PendingMessage,
} from './types';
import StorageUtils from './storage';
import {
filterThoughtFromMsgs,
@ -10,6 +16,7 @@ import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation } from 'react-router';
interface AppContextValue {
// conversations and messages
viewingConversation: Conversation | null;
pendingMessages: Record<Conversation['id'], PendingMessage>;
isGenerating: (convId: string) => boolean;
@ -26,8 +33,15 @@ interface AppContextValue {
onChunk?: CallbackGeneratedChunk
) => Promise<void>;
// canvas
canvasData: CanvasData | null;
setCanvasData: (data: CanvasData | null) => void;
// config
config: typeof CONFIG_DEFAULT;
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
}
// for now, this callback is only used for scrolling to the bottom of the chat
@ -54,8 +68,13 @@ export const AppContextProvider = ({
Record<Conversation['id'], AbortController>
>({});
const [config, setConfig] = useState(StorageUtils.getConfig());
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// handle change when the convId from URL is changed
useEffect(() => {
// also reset the canvas data
setCanvasData(null);
const handleConversationChange = (changedConvId: string) => {
if (changedConvId !== convId) return;
setViewingConversation(StorageUtils.getOneConversation(convId));
@ -292,8 +311,12 @@ export const AppContextProvider = ({
sendMessage,
stopGenerating,
replaceMessageAndGenerate,
canvasData,
setCanvasData,
config,
saveConfig,
showSettings,
setShowSettings,
}}
>
{children}

View file

@ -0,0 +1,38 @@
export const XCloseButton: React.ElementType<
React.ClassAttributes<HTMLButtonElement> &
React.HTMLAttributes<HTMLButtonElement>
> = ({ className, ...props }) => (
<button className={`btn btn-square btn-sm ${className ?? ''}`} {...props}>
<svg
xmlns="http://www.w3.org/2000/svg"
className="h-6 w-6"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
>
<path
strokeLinecap="round"
strokeLinejoin="round"
strokeWidth="2"
d="M6 18L18 6M6 6l12 12"
/>
</svg>
</button>
);
export const OpenInNewTab = ({
href,
children,
}: {
href: string;
children: string;
}) => (
<a
className="underline"
href={href}
target="_blank"
rel="noopener noreferrer"
>
{children}
</a>
);

View file

@ -85,3 +85,6 @@ export function classNames(classes: Record<string, boolean>): string {
.map(([key, _]) => key)
.join(' ');
}
export const delay = (ms: number) =>
new Promise((resolve) => setTimeout(resolve, ms));

View file

@ -23,3 +23,14 @@ export interface Conversation {
export type PendingMessage = Omit<Message, 'content'> & {
content: string | null;
};
export enum CanvasType {
PY_INTERPRETER,
}
export interface CanvasPyInterpreter {
type: CanvasType.PY_INTERPRETER;
content: string;
}
export type CanvasData = CanvasPyInterpreter;

View file

@ -72,5 +72,9 @@ export default defineConfig({
proxy: {
'/v1': 'http://localhost:8080',
},
headers: {
'Cross-Origin-Embedder-Policy': 'require-corp',
'Cross-Origin-Opener-Policy': 'same-origin',
},
},
});

View file

@ -10,8 +10,6 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_BACKEND_API void ggml_vk_instance_init(void);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

View file

@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif
#if defined(__loongarch_asx)
typedef union {
int32_t i;
float f;
} ft_union;
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
static __m128 __lsx_vreplfr2vr_s(const float val) {
v4f32 res = {val, val, val, val};
return (__m128)res;
}
static __m256 __lasx_xvreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
static __m256 __lasx_xvreplfr2vr_s(const float val) {
v8f32 res = {val, val, val, val, val, val, val, val};
return (__m256)res;
}
#endif

View file

@ -501,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
}
static __m256i lasx_extu8_16(__m128i a) {
__m128i zero = __lsx_vldi(0);
__m128i vlo = __lsx_vilvl_b(zero, a);
__m128i vhi = __lsx_vilvh_b(zero, a);
return lasx_set_q(vhi, vlo);
return __lasx_vext2xv_hu_bu(____m256i(a));
}
static __m256i lasx_ext8_16(__m128i a) {
__m128i sign = __lsx_vslti_b(a, 0);
__m128i vlo = __lsx_vilvl_b(sign, a);
__m128i vhi = __lsx_vilvh_b(sign, a);
return lasx_set_q(vhi, vlo);
return __lasx_vext2xv_h_b(____m256i(a));
}
static __m256i lasx_ext16_32(__m128i a) {
__m256i tmp1;
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
return tmp1;
return __lasx_vext2xv_w_h(____m256i(a));
}
static __m128i lasx_extracti128( __m256i a, int pos) {
@ -592,12 +577,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
__m128 res = lasx_extractf128(x, 1);
ft_union tmp;
res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
tmp.i = __lsx_vpickve2gr_w(res, 0);
return tmp.f;
return ((v4f32)res)[0];
}
// horizontally add 8 int32_t
@ -939,7 +922,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
#elif defined(__loongarch_asx)
for (int i = 0; i < nb; i++) {
ft_union fi;
__m256 v0 = (__m256)__lasx_xvld( x , 0);
__m256 v1 = (__m256)__lasx_xvld( x , 32);
__m256 v2 = (__m256)__lasx_xvld( x , 64);
@ -957,8 +939,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
__m128 tmp = max4;
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
const float max_scalar = fi.f;
const float max_scalar = ((v4f32)max4)[0];
// Quantize these floats
const float d = max_scalar / 127.f;
@ -1263,7 +1244,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
#elif defined(__loongarch_asx)
for (int i = 0; i < nb; i++) {
ft_union ft;
__m256 v0 = (__m256)__lasx_xvld( x , 0 );
__m256 v1 = (__m256)__lasx_xvld( x , 32 );
__m256 v2 = (__m256)__lasx_xvld( x , 64 );
@ -1281,8 +1261,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
__m128 tmp = max4;
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
const float max_scalar = ft.f;
const float max_scalar = ((v4f32)max4)[0];
// Quantize these floats
const float d = max_scalar / 127.f;
@ -6154,9 +6133,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
ft_union fi;
fi.i = __lsx_vpickve2gr_w(acc_m, 0);
*s = hsum_float_8(acc) + fi.f ;
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
#else
const uint8_t * scales = (const uint8_t*)&utmp[0];

View file

@ -1078,29 +1078,23 @@ do { \
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
// F16 arithmetic is not supported by AVX, so we use F32 instead
// F16 arithmetic is not supported by LASX, so we use F32 instead
#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
}
return (__m256)__lasx_xvld(tmp, 0);
__m256i a;
memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
a = __lasx_xvpermi_d(a, 0 | (1 << 4));
return __lasx_xvfcvtl_s_h(a);
}
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
float arr[8];
__lasx_xvst(y, arr, 0);
for (int i = 0; i < 8; i++) {
x[i] = GGML_FP32_TO_FP16(arr[i]);
}
__m256i a = __lasx_xvfcvt_h_s(y, y);
a = __lasx_xvpermi_d(a, 0 | (2 << 2));
memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
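As a rough scalar reference for what the 8-wide load above computes, here is a sketch that widens eight fp16 values to fp32; the half-to-float helper handles normal values only and is not ggml's GGML_FP16_TO_FP32.

#include <cstdint>
#include <cstdio>
#include <cstring>

static float fp16_to_fp32(uint16_t h) {
    // illustrative conversion for normal numbers (no subnormals/inf/NaN)
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1f;
    const uint32_t man  = h & 0x3ff;
    const uint32_t bits = sign | ((exp + 112) << 23) | (man << 13);
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    const uint16_t half[8] = {0x3c00, 0x4000, 0x4200, 0x4400,
                              0x4500, 0x4600, 0x4700, 0x4800};  // 1.0 .. 8.0 in fp16
    float out[8];
    for (int i = 0; i < 8; ++i) out[i] = fp16_to_fp32(half[i]);  // the "load" widening
    for (int i = 0; i < 8; ++i) std::printf("%.1f ", out[i]);
    std::printf("\n");
    return 0;
}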
@ -13862,9 +13856,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
tp->ec = GGML_STATUS_ABORTED;
}
ggml_barrier(state->threadpool);
if (node_n + 1 < cgraph->n_nodes) {
ggml_barrier(state->threadpool);
}
}
ggml_barrier(state->threadpool);
return 0;
}
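A minimal standalone sketch of the barrier placement this hunk ends up with (C++20 std::barrier standing in for ggml_barrier, a vector of ints standing in for the cgraph, all names hypothetical): every participant, including the calling thread, passes one final barrier after the node loop, so the caller can tear the graph down without another worker still reading it.

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

static void worker(std::vector<int> *graph, std::barrier<> &sync, long *sum) {
    for (size_t node = 0; node < graph->size(); ++node) {
        *sum += (*graph)[node];              // "compute" the node
        if (node + 1 < graph->size()) {
            sync.arrive_and_wait();          // barrier between nodes only
        }
    }
    sync.arrive_and_wait();                  // final barrier after the loop
}

int main() {
    const int n_threads = 4;
    auto *graph = new std::vector<int>(8, 1);
    std::barrier<> sync(n_threads);
    std::vector<long> sums(n_threads, 0);

    std::vector<std::thread> secondaries;
    for (int t = 1; t < n_threads; ++t) {
        secondaries.emplace_back(worker, graph, std::ref(sync), &sums[t]);
    }
    worker(graph, sync, &sums[0]);           // the caller participates too
    delete graph;                            // safe: everyone passed the final barrier
    for (auto &t : secondaries) t.join();
    std::printf("caller sum = %ld\n", sums[0]);
    return 0;
}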

View file

@ -16,7 +16,7 @@
#include "common.cuh"
#if CUDART_VERSION >= 11800
#if CUDART_VERSION >= 11080
static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
int ret = 0;
@ -50,7 +50,7 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
return ret_low | ret_high;
}
#endif // CUDART_VERSION >= 11800
#endif // CUDART_VERSION >= 11080
template <typename T>
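A worked check of the corrected guard, assuming the usual cuda_runtime encoding of CUDART_VERSION as major*1000 + minor*10: CUDA 11.8 compares as 11080, whereas the old 11800 threshold would only match a hypothetical 11.80 release.

#include <cstdio>

int main() {
    const int major = 11, minor = 8;
    const int encoded = major * 1000 + minor * 10;   // 11080
    std::printf("CUDA %d.%d -> CUDART_VERSION %d (>= 11080: %s)\n",
                major, minor, encoded, encoded >= 11080 ? "yes" : "no");
    return 0;
}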

View file

@ -103,11 +103,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
auto global_mem_size = prop.get_global_mem_size()/1000000;
std::string xmx = gpu_has_xmx(device) ? "yes" : "no";
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(),
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str(), xmx.c_str());
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}
void ggml_backend_sycl_print_sycl_devices() {
@ -118,16 +117,16 @@ void ggml_backend_sycl_print_sycl_devices() {
GGML_LOG_INFO(
"| | | | "
" |Max | |Max |Global | | XMX |\n");
" |Max | |Max |Global | |\n");
GGML_LOG_INFO(
"| | | | "
" |compute|Max work|sub |mem | | or |\n");
" |compute|Max work|sub |mem | |\n");
GGML_LOG_INFO(
"|ID| Device Type| "
"Name|Version|units |group |group|size | Driver version| Tensor Cores |\n");
"Name|Version|units |group |group|size | Driver version|\n");
GGML_LOG_INFO(
"|--|-------------------|---------------------------------------|------"
"-|-------|--------|-----|-------|---------------------|--------------|\n");
"-|-------|--------|-----|-------|---------------------|\n");
for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id);

View file

@ -167,6 +167,7 @@ struct vk_device_struct {
uint32_t subgroup_size;
uint32_t shader_core_count;
bool uma;
bool prefer_host_memory;
bool float_controls_rte_fp16;
bool subgroup_size_control;
@ -184,12 +185,12 @@ struct vk_device_struct {
size_t idx;
bool mul_mat_l;
bool mul_mat_m;
bool mul_mat_s;
bool mul_mat_id_l;
bool mul_mat_id_m;
bool mul_mat_id_s;
bool mul_mat_l[GGML_TYPE_COUNT];
bool mul_mat_m[GGML_TYPE_COUNT];
bool mul_mat_s[GGML_TYPE_COUNT];
bool mul_mat_id_l[GGML_TYPE_COUNT];
bool mul_mat_id_m[GGML_TYPE_COUNT];
bool mul_mat_id_s[GGML_TYPE_COUNT];
// set to true to indicate that some shaders need to be compiled after the dryrun
bool need_compiles {};
@ -1294,7 +1295,9 @@ static vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk:
static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
vk_buffer buf;
try {
if (device->uma) {
if (device->prefer_host_memory) {
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal);
} else if (device->uma) {
// Fall back to host memory type
buf = ggml_vk_create_buffer(device, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
} else {
@ -1378,7 +1381,33 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ
return {64, 64};
};
static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id) {
static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
uint32_t lut_size = 0;
switch (src0_type) {
case GGML_TYPE_IQ2_XXS:
lut_size = 8*256;
break;
case GGML_TYPE_IQ2_XS:
lut_size = 8*512;
break;
case GGML_TYPE_IQ2_S:
lut_size = 8*1024;
break;
case GGML_TYPE_IQ3_XXS:
lut_size = 4*256;
break;
case GGML_TYPE_IQ3_S:
lut_size = 4*512;
break;
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
lut_size = 4*16;
break;
default:
break;
}
// Needs to be kept up to date on shader changes
const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1;
const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
@ -1388,7 +1417,13 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;
return (load_bufs + mmid_row_ids + coopmat_stage) <= device->properties.limits.maxComputeSharedMemorySize;
const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
VK_LOG_DEBUG("ggml_vk_matmul_shmem_support(warptile=(" << warptile[0] << "," << warptile[1] << "," << warptile[2] << "), "
"mul_mat_id=" << mul_mat_id << ", src0_type=" << ggml_type_name(src0_type) << ", supported=" << supported);
return supported;
}
static void ggml_vk_load_shaders(vk_device& device) {
@ -1472,62 +1507,32 @@ static void ggml_vk_load_shaders(vk_device& device) {
m_align = 64;
s_align = 32;
// Fallback to smaller sizes if there's not enough shared memory. Given the current shaders
// and tile sizes, this should handle 16KB, 32KB, and 48KB+.
// This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders.
// But the numbers happen to work out for 32KB shared memory size that when using the medium
// size there's enough room for everything, and we assert for this.
uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
l_warptile = m_warptile;
l_wg_denoms = m_wg_denoms;
shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float);
GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
}
if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
// assert mul_mat_mat_id shaders will fit.
GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
}
shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) {
if (device->properties.limits.maxComputeSharedMemorySize == 32768) {
l_warptile_mmq = m_warptile_mmq;
l_mmq_wg_denoms = m_mmq_wg_denoms;
} else {
l_warptile_mmq = s_warptile_mmq;
l_mmq_wg_denoms = s_mmq_wg_denoms;
for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
ggml_type t = (ggml_type)i;
// Disable medium and large matrix multiplication if not enough shared memory is available
// Check mmq warptiles as the largest configuration
// Throw an error if not enough for any matrix multiplication is available
if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false, t)) {
std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl;
throw std::runtime_error("Shared memory size too small for matrix multiplication.");
} else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false, t)) {
device->mul_mat_m[i] = false;
device->mul_mat_l[i] = false;
} else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false, t)) {
device->mul_mat_l[i] = false;
}
shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float);
GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize);
}
if (device->properties.limits.maxComputeSharedMemorySize >= 32768) {
// assert mul_mat_mat_id shaders will fit.
GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize);
}
// Disable medium and large matrix multiplication if not enough shared memory is available
// Check mmq warptiles as the largest configuration
// Throw an error if not enough for any matrix multiplication is available
if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false)) {
std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl;
throw std::runtime_error("Shared memory size too small for matrix multiplication.");
} else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false)) {
device->mul_mat_m = false;
device->mul_mat_l = false;
} else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false)) {
device->mul_mat_l = false;
}
// Disable mul_mat_id if not enough shared memory is available
if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true)) {
device->mul_mat_id_s = false;
device->mul_mat_id_m = false;
device->mul_mat_id_l = false;
} else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true)) {
device->mul_mat_id_m = false;
device->mul_mat_id_l = false;
} else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true)) {
device->mul_mat_id_l = false;
// Disable mul_mat_id if not enough shared memory is available
if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true, t)) {
device->mul_mat_id_s[i] = false;
device->mul_mat_id_m[i] = false;
device->mul_mat_id_l[i] = false;
} else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true, t)) {
device->mul_mat_id_m[i] = false;
device->mul_mat_id_l[i] = false;
} else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true, t)) {
device->mul_mat_id_l[i] = false;
}
}
}
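A back-of-the-envelope sketch of the check above with made-up tile sizes (only the 3072*sizeof(uint32_t) row_ids block and the 8*1024-byte IQ2_S table come from the patch; the other numbers are hypothetical): sum the pieces and disable any size whose total exceeds the device limit.

#include <cstdint>
#include <cstdio>

static bool fits(uint32_t load_bufs, uint32_t limit) {
    const uint32_t mmid_row_ids  = 3072 * sizeof(uint32_t); // 12 KB, mul_mat_id only
    const uint32_t coopmat_stage = 4 * 1024;                // hypothetical staging area
    const uint32_t lut_size      = 8 * 1024;                // e.g. the IQ2_S dequant table
    return load_bufs + mmid_row_ids + coopmat_stage + lut_size <= limit;
}

int main() {
    const uint32_t limit  = 48 * 1024;                      // a common shared-memory limit
    const uint32_t l_bufs = 36 * 1024, m_bufs = 20 * 1024, s_bufs = 10 * 1024;
    std::printf("l fits: %d  m fits: %d  s fits: %d\n",
                fits(l_bufs, limit), fits(m_bufs, limit), fits(s_bufs, limit));
    return 0;                                               // here only the large size is disabled
}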
@ -1684,119 +1689,116 @@ static void ggml_vk_load_shaders(vk_device& device) {
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (device->coopmat_support) {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l) \
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
if (device->mul_mat ## ID ## _m) \
if (device->mul_mat ## ID ## _m[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
if (device->mul_mat ## ID ## _s) \
if (device->mul_mat ## ID ## _s[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
if (device->mul_mat ## ID ## _l) \
if (device->mul_mat ## ID ## _l[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
if (device->mul_mat ## ID ## _m) \
if (device->mul_mat ## ID ## _m[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
if (device->mul_mat ## ID ## _s) \
if (device->mul_mat ## ID ## _s[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
// Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->coopmat_acc_f16_support) { \
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
} \
if (device->coopmat_acc_f32_support) { \
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
} \
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
}
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
if (device->coopmat_acc_f16_support) {
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else {
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
#undef CREATE_MM2
#undef CREATE_MM
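A toy sketch of the guarded-creation pattern these macros now follow (names and flags are made up): the capability arrays are indexed by type, so a pipeline variant is only built when the shared-memory check left that size enabled for that type.

#include <cstdio>

enum fake_type { FAKE_F32, FAKE_Q4_0, FAKE_TYPE_COUNT };

struct fake_device {
    bool mul_mat_l[FAKE_TYPE_COUNT] = {true, false};  // large disabled for the quantized type
};

#define CREATE_L(TYPE, NAME)                       \
    if (dev.mul_mat_l[TYPE]) {                     \
        std::printf("create %s (large)\n", NAME);  \
    }

int main() {
    fake_device dev;
    CREATE_L(FAKE_F32,  "matmul_f32_f32");
    CREATE_L(FAKE_Q4_0, "matmul_q4_0_f32");  // skipped: flag is false for this type
    return 0;
}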
@ -1804,141 +1806,135 @@ static void ggml_vk_load_shaders(vk_device& device) {
#endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (device->fp16) {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l) \
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
if (device->mul_mat ## ID ## _m) \
if (device->mul_mat ## ID ## _m[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
if (device->mul_mat ## ID ## _s) \
if (device->mul_mat ## ID ## _s[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
if (device->mul_mat ## ID ## _l) \
if (device->mul_mat ## ID ## _l[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
if (device->mul_mat ## ID ## _m) \
if (device->mul_mat ## ID ## _m[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
if (device->mul_mat ## ID ## _s) \
if (device->mul_mat ## ID ## _s[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
// Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
#define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(TYPE, PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(TYPE, PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f16acc, matmul_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f16acc, matmul_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f16acc, matmul_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f16acc, matmul_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f16acc, matmul_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f16acc, matmul_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(GGML_TYPE_F16, pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f16acc, matmul_id_iq2_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f16acc, matmul_id_iq2_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f16acc, matmul_id_iq2_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f16acc, matmul_id_iq3_xxs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f16acc, matmul_id_iq3_s_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f16acc, matmul_id_iq4_xs_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
#undef CREATE_MM2
#undef CREATE_MM
} else {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l) \
#define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
if (device->mul_mat ## ID ## _l[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
if (device->mul_mat ## ID ## _m) \
if (device->mul_mat ## ID ## _m[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
if (device->mul_mat ## ID ## _s) \
if (device->mul_mat ## ID ## _s[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
if (device->mul_mat ## ID ## _l) \
if (device->mul_mat ## ID ## _l[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
if (device->mul_mat ## ID ## _m) \
if (device->mul_mat ## ID ## _m[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
if (device->mul_mat ## ID ## _s) \
if (device->mul_mat ## ID ## _s[TYPE]) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XXS].f32acc, matmul_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_XS].f32acc, matmul_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ2_S].f32acc, matmul_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_XXS].f32acc, matmul_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ3_S].f32acc, matmul_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_XS].f32acc, matmul_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_F32, pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_F16, pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_1, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q8_0, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
CREATE_MM(GGML_TYPE_Q2_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q3_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q4_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q5_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_Q6_K, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XXS].f32acc, matmul_id_iq2_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_XS].f32acc, matmul_id_iq2_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ2_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ2_S].f32acc, matmul_id_iq2_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_XXS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_XXS].f32acc, matmul_id_iq3_xxs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ3_S, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ3_S].f32acc, matmul_id_iq3_s_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_XS, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_XS].f32acc, matmul_id_iq4_xs_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(GGML_TYPE_IQ4_NL, pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
#undef CREATE_MM
}
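For readers following the macro rewrite above, here is roughly what a single invocation of the fp32-accumulator CREATE_MM variant expands to after token pasting (only the large and aligned-large branches are shown; matmul_f32_f32_fp32_len/_data are the embedded SPIR-V arrays produced by the shader build step, and an empty F16ACC argument simply drops out of the pasted names):
// CREATE_MM(GGML_TYPE_F32, pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ) expands to:
if (device->mul_mat_l[GGML_TYPE_F32])
    ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_f32_l",
        matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3,
        sizeof(vk_mat_mat_push_constants), l_wg_denoms, l_warptile, 1);
if (device->mul_mat_l[GGML_TYPE_F32])
    ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->a_l, "matmul_f32_f32_aligned_l",
        matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3,
        sizeof(vk_mat_mat_push_constants), l_wg_denoms, l_warptile, l_align);
// ...and likewise for the m/s and aligned m/s branches.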
@@ -2206,6 +2202,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->physical_device = physical_devices[dev_num];
const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
bool fp16_storage = false;
bool fp16_compute = false;
bool maintenance4_support = false;
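The two lines above wire up the new GGML_VK_PREFER_HOST_MEMORY switch: when the variable is set (to any value), the backend favours host-visible memory over VRAM when allocating buffers. Below is a minimal sketch of the kind of memory-type selection such a flag can steer; the helper name and the flag priorities are illustrative, not the backend's actual allocator:
#include <cstdint>
#include <vulkan/vulkan.hpp>

// Pick a memory type index: host-visible first when prefer_host is set, otherwise device-local.
static uint32_t pick_memory_type(const vk::PhysicalDeviceMemoryProperties & props,
                                 uint32_t allowed_type_bits, bool prefer_host) {
    const vk::MemoryPropertyFlags wanted = prefer_host
        ? vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent
        : vk::MemoryPropertyFlags(vk::MemoryPropertyFlagBits::eDeviceLocal);
    for (uint32_t i = 0; i < props.memoryTypeCount; ++i) {
        if ((allowed_type_bits & (1u << i)) && (props.memoryTypes[i].propertyFlags & wanted) == wanted) {
            return i;
        }
    }
    return UINT32_MAX; // caller falls back to any allowed memory type
}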
@@ -2623,34 +2622,36 @@ static vk_device ggml_vk_get_device(size_t idx) {
// Shaders
// Disable matmul tile sizes early if performance low or not supported
switch (device->vendor_id) {
for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
switch (device->vendor_id) {
#ifndef GGML_VULKAN_RUN_TESTS
case VK_VENDOR_ID_AMD:
case VK_VENDOR_ID_INTEL:
device->mul_mat_l = false;
device->mul_mat_m = true;
device->mul_mat_s = true;
device->mul_mat_id_l = false;
device->mul_mat_id_m = true;
device->mul_mat_id_s = true;
break;
case VK_VENDOR_ID_APPLE:
device->mul_mat_l = false;
device->mul_mat_m = true;
device->mul_mat_s = false;
device->mul_mat_id_l = false;
device->mul_mat_id_m = true;
device->mul_mat_id_s = false;
break;
case VK_VENDOR_ID_AMD:
case VK_VENDOR_ID_INTEL:
device->mul_mat_l[i] = false;
device->mul_mat_m[i] = true;
device->mul_mat_s[i] = true;
device->mul_mat_id_l[i] = false;
device->mul_mat_id_m[i] = true;
device->mul_mat_id_s[i] = true;
break;
case VK_VENDOR_ID_APPLE:
device->mul_mat_l[i] = false;
device->mul_mat_m[i] = true;
device->mul_mat_s[i] = false;
device->mul_mat_id_l[i] = false;
device->mul_mat_id_m[i] = true;
device->mul_mat_id_s[i] = false;
break;
#endif
default:
device->mul_mat_l = true;
device->mul_mat_m = true;
device->mul_mat_s = true;
device->mul_mat_id_l = true;
device->mul_mat_id_m = true;
device->mul_mat_id_s = true;
break;
default:
device->mul_mat_l[i] = true;
device->mul_mat_m[i] = true;
device->mul_mat_s[i] = true;
device->mul_mat_id_l[i] = true;
device->mul_mat_id_m[i] = true;
device->mul_mat_id_s[i] = true;
break;
}
}
ggml_vk_load_shaders(device);
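Note the shape change in this loop: mul_mat_{s,m,l} and mul_mat_id_{s,m,l} are no longer single booleans but arrays indexed by the source tensor's ggml_type (the loop runs over GGML_TYPE_COUNT entries), so a tile size can now be disabled for one quantization format while staying enabled for another. A hedged sketch of the vk_device members implied by this indexing (names as used above; the actual declarations are not shown in this diff):
    bool mul_mat_l[GGML_TYPE_COUNT];      // was: bool mul_mat_l;
    bool mul_mat_m[GGML_TYPE_COUNT];
    bool mul_mat_s[GGML_TYPE_COUNT];
    bool mul_mat_id_l[GGML_TYPE_COUNT];   // likewise for the MUL_MAT_ID variants
    bool mul_mat_id_m[GGML_TYPE_COUNT];
    bool mul_mat_id_s[GGML_TYPE_COUNT];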
@@ -2780,8 +2781,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
std::string device_name = props2.properties.deviceName.data();
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n",
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str());
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n",
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str());
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
@@ -2791,14 +2793,12 @@ static void ggml_vk_print_gpu_info(size_t idx) {
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
void ggml_vk_instance_init() {
static void ggml_vk_instance_init() {
if (vk_instance_initialized) {
return;
}
VK_LOG_DEBUG("ggml_vk_instance_init()");
vk_instance_initialized = true;
uint32_t api_version = vk::enumerateInstanceVersion();
if (api_version < VK_API_VERSION_1_2) {
@@ -2849,6 +2849,7 @@ void ggml_vk_instance_init() {
GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
}
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
@@ -2873,7 +2874,7 @@ void ggml_vk_instance_init() {
// Make sure at least one device exists
if (devices.empty()) {
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
GGML_ABORT("fatal error");
return;
}
// Default to using all dedicated GPUs
@@ -3755,31 +3756,31 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int
return split_k;
}
static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
if (ctx->device->coopmat2) {
if ((ctx->device->mul_mat_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_s)) {
if ((ctx->device->mul_mat_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
return aligned ? mmp->a_l : mmp->l;
}
if ((ctx->device->mul_mat_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s) {
if ((ctx->device->mul_mat_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s[src0_type]) {
return aligned ? mmp->a_m : mmp->m;
}
return aligned ? mmp->a_s : mmp->s;
}
if ((ctx->device->mul_mat_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_l)) {
if ((ctx->device->mul_mat_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_l[src0_type])) {
return aligned ? mmp->a_s : mmp->s;
}
if ((ctx->device->mul_mat_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l) {
if ((ctx->device->mul_mat_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l[src0_type]) {
return aligned ? mmp->a_m : mmp->m;
}
return aligned ? mmp->a_l : mmp->l;
}
static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align;
static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, src0_type)->align;
}
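To make the non-coopmat2 tier selection above concrete (values are illustrative): with m = 512 and n = 16 the small tile wins because n <= 32; with m = 512 and n = 512 the large tile is chosen unless mul_mat_l[src0_type] was disabled during device init, in which case the function degrades to the medium or small variant; and the aligned flavours (a_s/a_m/a_l) are only returned when the caller has already determined that the K dimension is a multiple of the pipeline's align value.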
static void ggml_vk_matmul(
@@ -3806,31 +3807,31 @@ static void ggml_vk_matmul(
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
}
static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")");
static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
if (ctx->device->coopmat2) {
if ((ctx->device->mul_mat_id_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_s)) {
if ((ctx->device->mul_mat_id_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
return aligned ? mmp->a_l : mmp->l;
}
if ((ctx->device->mul_mat_id_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s) {
if ((ctx->device->mul_mat_id_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s[src0_type]) {
return aligned ? mmp->a_m : mmp->m;
}
return aligned ? mmp->a_s : mmp->s;
}
if ((ctx->device->mul_mat_id_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_l)) {
if ((ctx->device->mul_mat_id_s[src0_type] && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_l[src0_type])) {
return aligned ? mmp->a_s : mmp->s;
}
if ((ctx->device->mul_mat_id_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l) {
if ((ctx->device->mul_mat_id_m[src0_type] && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l[src0_type]) {
return aligned ? mmp->a_m : mmp->m;
}
return aligned ? mmp->a_l : mmp->l;
}
static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")");
return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true)->align;
static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type) {
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ", " << ggml_type_name(src0_type) << ")");
return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true, src0_type)->align;
}
static void ggml_vk_matmul_id(
@@ -4011,10 +4012,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11));
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned);
vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);
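The alignment decision just above works as follows: ggml_vk_align_size rounds ne10 up to the selected pipeline's align value, so with an illustrative align of 32, ne10 = 4096 yields kpad = 4096 == ne10 and (given ne01 > 8 and ne11 > 8) the aligned pipeline is dispatched, whereas ne10 = 4100 yields kpad = 4128 != ne10 and the unaligned variant is used. The qx_needs_dequant ternary reflects that a quantized src0 which is dequantized on the fly is matrix-multiplied as F16, so the F16 tile-size flags are the ones consulted in that case.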
@@ -4593,10 +4594,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
const uint64_t y_ne = ne11 * ne10;
const uint64_t d_ne = ne21 * ne20;
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1));
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;
vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned);
vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);
const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -8035,13 +8036,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
ggml_type src0_type = op->src[0]->type;
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
const vk_device& device = ggml_vk_get_device(ctx->device);
if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s && !device->mul_mat_id_m && !device->mul_mat_id_l) {
if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
return false;
}
switch (op->src[0]->type) {
switch (src0_type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
@@ -8347,8 +8349,13 @@ ggml_backend_reg_t ggml_backend_vk_reg() {
/* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr,
};
return &reg;
try {
ggml_vk_instance_init();
return &reg;
} catch (const vk::SystemError& e) {
VK_LOG_DEBUG("ggml_backend_vk_reg() -> Error: System error: " << e.what());
return nullptr;
}
}
// Extension availability
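Together with the earlier change that returns from ggml_vk_instance_init instead of calling GGML_ABORT when no devices are found, this try/catch is what makes the backend optional at runtime: registration now reports failure instead of terminating the process. A caller-side sketch (ggml_backend_vk_reg is the real entry point; the fallback comment is illustrative):
#include "ggml-vulkan.h"

static bool vulkan_is_usable() {
    // nullptr means the Vulkan loader/ICD is missing or instance creation failed;
    // the application can quietly fall back to another backend instead of aborting.
    return ggml_backend_vk_reg() != nullptr;
}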

include/llama.h

@@ -1114,11 +1114,12 @@ extern "C" {
};
struct llama_sampler {
struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
const struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
// mirror of llama_sampler_i:
LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
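Exporting llama_sampler_init (together with the now const-qualified iface) gives external code a supported way to construct custom samplers without depending on the struct layout. A minimal sketch of a no-op custom sampler; the llama_sampler_i callback names and order below (name/accept/apply/reset/clone/free) are recalled from llama.h rather than shown in this diff, so verify them against the header:
#include "llama.h"

static const char * my_noop_name(const struct llama_sampler * /*smpl*/) { return "my-noop"; }
static void my_noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {
    // leave the candidate array untouched
}

static struct llama_sampler_i my_noop_i = {
    /* .name   = */ my_noop_name,
    /* .accept = */ nullptr,
    /* .apply  = */ my_noop_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ nullptr,
};

// during setup, e.g. before adding it to a sampler chain:
//   struct llama_sampler * smpl = llama_sampler_init(&my_noop_i, /* ctx = */ nullptr);
//   llama_sampler_chain_add(chain, smpl);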

src/llama-model.cpp

@@ -1275,7 +1275,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const bool use_mmap_buffer = true;
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, use_mmap_buffer ? "true" : "false");
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices);

src/llama-sampling.cpp

@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
// llama_sampler API
struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
return new llama_sampler {
/* .iface = */ iface,
/* .ctx = */ ctx,
};
}
const char * llama_sampler_name(const struct llama_sampler * smpl) {
if (!smpl->iface) {
return "(null)";
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
}
if (smpl->ctx == nullptr) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ smpl->iface,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
GGML_ABORT("the sampler does not support cloning");
@@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
};
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_chain_i,
/* .ctx = */ new llama_sampler_chain {
/* .params = */ params,
/* .samplers = */ {},
/* .t_sample_us = */ 0,
/* .n_sample = */ 0,
},
};
}
);
}
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
};
struct llama_sampler * llama_sampler_init_greedy() {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_greedy_i,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
// dist
@@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_dist_i,
/* .ctx = */ new llama_sampler_dist {
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// softmax
@@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
};
struct llama_sampler * llama_sampler_init_softmax() {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_softmax_i,
/* .ctx = */ nullptr,
};
/* .ctx = */ nullptr
);
}
// top-k
@@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
};
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_k_i,
/* .ctx = */ new llama_sampler_top_k {
/* .k = */ k,
},
};
}
);
}
// top-p
@@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
};
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_top_p_i,
/* .ctx = */ new llama_sampler_top_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// min-p
@@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
};
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_min_p_i,
/* .ctx = */ new llama_sampler_min_p {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// typical
@@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
};
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_typical_i,
/* .ctx = */ new llama_sampler_typical {
/* .p = */ p,
/* .min_keep = */ min_keep,
},
};
}
);
}
// temp
@@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
};
struct llama_sampler * llama_sampler_init_temp(float temp) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_temp_i,
/* .ctx = */ new llama_sampler_temp {
/*.temp = */ temp,
},
};
}
);
}
// temp-ext
@@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
};
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_temp_ext_i,
/* .ctx = */ new llama_sampler_temp_ext {
/* .temp = */ temp,
/* .delta = */ delta,
/* .exponent = */ exponent,
},
};
}
);
}
// xtc
@@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_xtc_i,
/* .ctx = */ new llama_sampler_xtc {
/* .probability = */ p,
@@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// mirostat
@@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_mirostat_i,
/* .ctx = */ new llama_sampler_mirostat {
/* .n_vocab = */ n_vocab,
@@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
/* .m = */ m,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// mirostat v2
@@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_mirostat_v2_i,
/* .ctx = */ new llama_sampler_mirostat_v2 {
/* .seed = */ seed,
@@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
/* .eta = */ eta,
/* .mu = */ 2.0f*tau,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
);
}
// grammar
@@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
};
}
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_grammar_i,
/* .ctx = */ ctx,
};
/* .ctx = */ ctx
);
}
struct llama_sampler * llama_sampler_init_grammar(
@@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
float penalty_present) {
penalty_last_n = std::max(penalty_last_n, 0);
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_penalties_i,
/* .ctx = */ new llama_sampler_penalties {
/* .penalty_last_n = */ penalty_last_n,
@@ -1687,8 +1694,8 @@ struct llama_sampler * llama_sampler_init_penalties(
/* .penalty_present = */ penalty_present,
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
/* .token_count = */ {},
},
};
}
);
}
// DRY
@@ -2041,7 +2048,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
}
}
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_dry_i,
/* .ctx = */ new llama_sampler_dry {
/* .total_context_size = */ context_size,
@@ -2053,8 +2060,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
/* .dry_max_token_repeat = */ {},
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
},
};
}
);
}
// wrapper for test-sampling.cpp
@@ -2155,14 +2162,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
const llama_logit_bias * logit_bias) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_logit_bias_i,
/* .ctx = */ new llama_sampler_logit_bias {
/* .n_vocab = */ n_vocab,
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
/* .to_search = */ {},
},
};
}
);
}
// infill
@@ -2377,14 +2384,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
};
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
return new llama_sampler {
return llama_sampler_init(
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
},
};
}
);
}
// utils

src/llama.cpp

@@ -8801,12 +8801,14 @@ static int llama_decode_impl(
//llama_synchronize(&lctx);
// decide if we need to defrag the kv cache
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
// - do not defrag small contexts (i.e. < 2048 tokens)
// - count the padding towards the number of used tokens
const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
// queue defragmentation for next llama_kv_cache_update
if (fragmentation > cparams.defrag_thold) {
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
llama_kv_cache_defrag(kv_self);
}
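A quick worked example of the new threshold (numbers are illustrative): with kv_self.n = 4096 cells, kv_self.used = 3000 and a padding of 32, fragmentation = 1.0 - (3000 + 32)/4096 ≈ 0.26, so a defrag_thold of 0.1 queues a defrag; the same cache at n = 1024 always reports 0.0 because contexts under 2048 cells are never defragmented. The switch from ">= 0.0f" to "> 0.0f" also means a threshold of exactly zero now disables the check entirely rather than triggering a defrag on any fragmentation.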
@@ -9428,8 +9430,6 @@ static struct llama_model * llama_model_load_from_file_impl(
struct llama_model_params params) {
ggml_time_init();
llama_model * model = new llama_model(params);
unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage;
@@ -9447,6 +9447,8 @@ static struct llama_model * llama_model_load_from_file_impl(
};
}
llama_model * model = new llama_model(params);
// create list of devices to use with this model
if (params.devices) {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {

src/unicode.cpp

@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
try {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
}
catch (const std::invalid_argument & /*ex*/) {
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
++offset;
result.emplace_back(0xFFFD); // replacement character
}
}
return result;
}
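The practical effect of the try/catch above, as a small sketch (assuming unicode_cpts_from_utf8 is called directly, e.g. from a test; the byte values are illustrative):
#include <string>
#include <vector>
#include "unicode.h"

// 0xFF can never start a valid UTF-8 sequence, so this used to throw
// std::invalid_argument out of llama_tokenize; now it degrades gracefully.
std::string bad = "ab\xFF" "cd";
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(bad);
// cpts == { 'a', 'b', 0xFFFD, 'c', 'd' }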