Merge remote-tracking branch 'origin/master' into sync

ggml-ci
2024-02-11 21:08:09 +01:00 · 2024-02-11 21:08:09 +01:00 · 76d5b7f76c
commit 76d5b7f76c
parent 7cba240bcf 3bdc4cd0f5
26 changed files with 1464 additions and 335 deletions
--- a/.flake8
+++ b/.flake8
@ -1,2 +1,3 @@
 [flake8]
 max-line-length = 125
 ignore = W503
--- a/common/common.cpp
+++ b/common/common.cpp
@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+            const auto sampler_names = string_split(argv[i], ';');
            sparams.samplers_sequence = sampler_types_from_names(sampler_names);
        } else if (arg == "--sampling-seq") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            sparams.samplers_sequence = argv[i];
+            sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
@ -906,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const llama_sampling_params & sparams = params.sparams;
    std::string sampler_type_chars;
    std::string sampler_type_names;
    for (const auto sampler_type : sparams.samplers_sequence) {
        sampler_type_chars += static_cast<char>(sampler_type);
        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
    }
    sampler_type_names.pop_back();
    printf("\n");
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
@ -947,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@ -1097,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }
 //
-// String parsing
+// String utils
 //
-std::string parse_samplers_input(std::string input) {
+std::vector<std::string> string_split(std::string input, char separator) {
-    std::string output = "";
+    std::vector<std::string> parts;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        std::string part = input.substr(0, separator_pos);
        parts.emplace_back(part);
        input = input.substr(separator_pos + 1);
        separator_pos = input.find(separator);
    }
    parts.emplace_back(input);
    return parts;
 }
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, char> samplers_symbols {
+    std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
-        {"top_k",      'k'},
+        {"top_k",       llama_sampler_type::TOP_K},
-        {"top-k",      'k'},
+        {"top-k",       llama_sampler_type::TOP_K},
-        {"top_p",      'p'},
+        {"top_p",       llama_sampler_type::TOP_P},
-        {"top-p",      'p'},
+        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",    'p'},
+        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical_p",  'y'},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"typical-p",  'y'},
+        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",    'y'},
+        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min_p",      'm'},
+        {"min_p",       llama_sampler_type::MIN_P},
-        {"min-p",      'm'},
+        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs_z",      'f'},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"tfs-z",      'f'},
+        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",        'f'},
+        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",       't'},
+        {"temp",        llama_sampler_type::TEMP},
-        {"temperature",'t'}
+        {"temperature", llama_sampler_type::TEMP}
    };
    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
    size_t separator = input.find(';');
    while (separator != input.npos) {
        std::string name = input.substr(0,separator);
        input = input.substr(separator+1);
        separator = input.find(';');
-        if (samplers_symbols.find(name) != samplers_symbols.end()) {
+    std::vector<llama_sampler_type> sampler_types;
-            output += samplers_symbols[name];
+    sampler_types.reserve(names.size());
    for (const auto& name : names) {
        const auto sampler_item = sampler_name_map.find(name);
        if (sampler_item != sampler_name_map.end()) {
            sampler_types.push_back(sampler_item->second);
        }
    }
-    if (samplers_symbols.find(input) != samplers_symbols.end()) {
+    return sampler_types;
-        output += samplers_symbols[input];
+}
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
    std::unordered_map<char, llama_sampler_type> sampler_name_map {
        {'k', llama_sampler_type::TOP_K},
        {'p', llama_sampler_type::TOP_P},
        {'y', llama_sampler_type::TYPICAL_P},
        {'m', llama_sampler_type::MIN_P},
        {'f', llama_sampler_type::TFS_Z},
        {'t', llama_sampler_type::TEMP}
    };
    std::vector<llama_sampler_type> sampler_types;
    sampler_types.reserve(names_string.size());
    for (const auto & c : names_string) {
        const auto sampler_item = sampler_name_map.find(c);
        if (sampler_item != sampler_name_map.end()) {
            sampler_types.push_back(sampler_item->second);
        }
    }
    return sampler_types;
 }
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
    switch (sampler_type) {
        case llama_sampler_type::TOP_K:     return "top_k";
        case llama_sampler_type::TFS_Z:     return "tfs_z";
        case llama_sampler_type::TYPICAL_P: return "typical_p";
        case llama_sampler_type::TOP_P:     return "top_p";
        case llama_sampler_type::MIN_P:     return "min_p";
        case llama_sampler_type::TEMP:      return "temp";
        default : return "";
    }
    return output;
 }
 //
@ -1550,6 +1599,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
 #ifdef NDEBUG
    fprintf(stream, "debug: false\n");
--- a/common/common.h
+++ b/common/common.h
@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 void process_escapes(std::string& input);
 //
-// String parsing
+// String utils
 //
-std::string parse_samplers_input(std::string input);
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 //
 // Model utils
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 std::string llama_sampling_order_print(const llama_sampling_params & params) {
    std::string result = "CFG -> Penalties ";
    if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
+        for (auto sampler_type : params.samplers_sequence) {
-            switch (s) {
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
-                case 'k': result += "-> top_k "; break;
+            if (!sampler_type_name.empty()) {
-                case 'f': result += "-> tfs_z "; break;
+                result += "-> " + sampler_type_name + " ";
                case 'y': result += "-> typical_p "; break;
                case 'p': result += "-> top_p "; break;
                case 'm': result += "-> min_p "; break;
                case 't': result += "-> temp "; break;
                default : break;
            }
        }
    } else {
@ -127,8 +122,6 @@ static void sampler_queue(
            const llama_sampling_params & params,
                 llama_token_data_array & cur_p,
                                 size_t & min_keep) {
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
    const float         temp              = params.temp;
    const float         dynatemp_range    = params.dynatemp_range;
    const float         dynatemp_exponent = params.dynatemp_exponent;
@ -137,16 +130,16 @@ static void sampler_queue(
    const float         min_p             = params.min_p;
    const float         tfs_z             = params.tfs_z;
    const float         typical_p         = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
+    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
-    for (auto s : samplers_sequence) {
+    for (auto sampler_type : samplers_sequence) {
-        switch (s){
+        switch (sampler_type) {
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't':
+            case llama_sampler_type::TEMP:
                if (dynatemp_range > 0) {
                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
--- a/common/sampling.h
+++ b/common/sampling.h
@ -8,6 +8,16 @@
 #include <vector>
 #include <unordered_map>
 // sampler types
 enum class llama_sampler_type : char {
    TOP_K     = 'k',
    TOP_P     = 'p',
    MIN_P     = 'm',
    TFS_Z     = 'f',
    TYPICAL_P = 'y',
    TEMP      = 't'
 };
 // sampling parameters
 typedef struct llama_sampling_params {
    int32_t     n_prev                = 64;       // number of previous tokens to remember
@ -28,7 +38,15 @@ typedef struct llama_sampling_params {
    float       mirostat_tau          = 5.00f;    // target entropy
    float       mirostat_eta          = 0.10f;    // learning rate
    bool        penalize_nl           = true;     // consider newlines as a repeatable token
-    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
+
    std::vector<llama_sampler_type> samplers_sequence = {
        llama_sampler_type::TOP_K,
        llama_sampler_type::TFS_Z,
        llama_sampler_type::TYPICAL_P,
        llama_sampler_type::TOP_P,
        llama_sampler_type::MIN_P,
        llama_sampler_type::TEMP
    };
    std::string grammar;  // optional BNF-like grammar to constrain sampling
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -209,6 +209,8 @@ class Model:
            return InternLM2Model
        if model_architecture == "MiniCPMForCausalLM":
            return MiniCPMModel
        if model_architecture == "BertModel":
            return BertModel
        return Model
    def _is_model_safetensors(self) -> bool:
@ -264,6 +266,8 @@ class Model:
            return gguf.MODEL_ARCH.INTERNLM2
        if arch == "MiniCPMForCausalLM":
            return gguf.MODEL_ARCH.MINICPM
        if arch == "BertModel":
            return gguf.MODEL_ARCH.BERT
        raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -1629,6 +1633,96 @@ in chat mode so that the conversation can end normally.")
                self.post_write_tensors(tensor_map, name, data_torch)
 class BertModel(Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.block_count = self.hparams["num_hidden_layers"]
    def set_gguf_parameters(self):
        # TODO(cebtenzzre): merge with parent class
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_causal_attention(False)
        self.gguf_writer.add_file_type(self.ftype)
    def set_vocab(self):
        path = self.dir_model
        added_tokens_path = self.dir_model if self.dir_model.exists() else None
        # use huggingface vocab to get all tokens
        vocab = HfVocab(path, added_tokens_path)
        tokens, scores, toktypes = zip(*vocab.all_tokens())
        assert len(tokens) == vocab.vocab_size
        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
        n_token_types = len(set(toktypes))
        self.gguf_writer.add_token_type_count(n_token_types)
        # convert to phantom space vocab
        def phantom(tok, typ):
            if tok.startswith(b"[") and tok.endswith(b"]"):
                return tok
            if tok.startswith(b"##"):
                return tok[2:]
            return b"\xe2\x96\x81" + tok
        tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)]
        # set up bos and eos tokens (cls and sep)
        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        # handle special tokens
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
    def write_tensors(self):
        tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        tensors = dict(self.get_tensors())
        for name, data_torch in tensors.items():
            # we are only using BERT for embeddings so we don't need the pooling layer
            if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
                continue  # we don't need these
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
            data = data_torch.squeeze().numpy()
            n_dims = len(data.shape)
            new_dtype: type[np.floating[Any]]
            if (
                self.ftype == 1 and name.endswith(".weight") and n_dims == 2
                and name != "embeddings.token_type_embeddings.weight"  # not used with get_rows, must be F32
            ):
                # if f16 desired, convert any float32 2-dim weight tensors to float16
                new_dtype = np.float16
            else:
                # if f32 desired, convert any float16 to float32
                new_dtype = np.float32
            print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
            if data.dtype != new_dtype:
                data = data.astype(new_dtype)
            self.gguf_writer.add_tensor(new_name, data)
 ###### CONVERSION LOGIC ######
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -87,7 +87,17 @@ int main(int argc, char ** argv) {
    }
    const int n_embd = llama_n_embd(model);
-    const auto * embeddings = llama_get_embeddings(ctx);
+    auto * embeddings = llama_get_embeddings(ctx);
    // l2-normalize embeddings
    float norm = 0;
    for (int i = 0; i < n_embd; i++) {
        norm += embeddings[i] * embeddings[i];
    }
    norm = sqrt(norm);
    for (int i = 0; i < n_embd; i++) {
        embeddings[i] /= norm;
    }
    for (int i = 0; i < n_embd; i++) {
        printf("%f ", embeddings[i]);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -98,7 +98,7 @@ static void write_logfile(
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
    if (signo == SIGINT) {
-        if (!is_interacting) {
+        if (!is_interacting && g_params->interactive) {
            is_interacting = true;
        } else {
            console::cleanup();
@ -392,7 +392,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("\n");
    }
-    if (params.interactive) {
+    // ctrl+C handling
    {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
@ -405,7 +406,9 @@ int main(int argc, char ** argv) {
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
    }
    if (params.interactive) {
        LOG_TEE("%s: interactive mode on.\n", __func__);
        if (!params.antiprompt.empty()) {
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -185,7 +185,7 @@ node index.js
    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -626,18 +626,36 @@ struct llama_server_context
            const int n_vocab = llama_n_vocab(model);
            for (const auto &el : *logit_bias)
            {
-                if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+                if (el.is_array() && el.size() == 2)
                {
                    float bias;
                    if (el[1].is_number())
                    {
                        bias = el[1].get<float>();
                    }
                    else if (el[1].is_boolean() && !el[1].get<bool>())
                    {
                        bias = -INFINITY;
                    }
                    else
                    {
                        continue;
                    }
                    if (el[0].is_number_integer())
                    {
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                        if (el[1].is_number())
+                            slot->sparams.logit_bias[tok] = bias;
                        {
                            slot->sparams.logit_bias[tok] = el[1].get<float>();
                        }
-                        else if (el[1].is_boolean() && !el[1].get<bool>())
+                    }
                    else if (el[0].is_string())
                    {
-                            slot->sparams.logit_bias[tok] = -INFINITY;
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                }
--- a/flake.lock
+++ b/flake.lock
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1706732774,
+        "lastModified": 1707268954,
-        "narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=",
+        "narHash": "sha256-2en1kvde3cJVc3ZnTy8QeD2oKcseLFjYPLKhIGDanQ0=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d",
+        "rev": "f8e2ebd66d097614d51a56a755450d4ae1632df1",
        "type": "github"
      },
      "original": {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -150,8 +150,8 @@
 #define CUDA_USE_TENSOR_CORES
 #endif
-// max batch size to use MMQ kernels when tensor cores are available
+#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 32
+#define  MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@ -5310,51 +5310,59 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
-#define MMVQ_NWARPS_NVIDIA    4
+template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 #define MMVQ_NWARPS_AMD_RDNA2 1
 #define MMVQ_NWARPS_AMD_OLD   4
 template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants
+// tell the compiler to use as many registers as it wants, see nwarps definition below
 __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
+    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
-    const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
    constexpr int nwarps              = 1;
    constexpr int rows_per_cuda_block = 1;
 #else
    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    const int row = blockIdx.x;
+    const     int row0 = rows_per_cuda_block*blockIdx.x;
    const     int blocks_per_row_x = ncols_x / qk;
    const     int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
+    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
 // partial sum for each thread
-    float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f};
+    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;
-    for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
+    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
-        const int ibx = row*blocks_per_row_x + i; // x block index
+        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+        // x block quant index when casting the quants to int
-
+        const int kqs = vdr * (tid % (qi/vdr));
        const int iqs  = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
 #pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
-            tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
+#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp[j][i] += vec_dot_q_cuda(
                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
            }
        }
    }
-    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE];
+    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
    if (threadIdx.y > 0) {
 #pragma unroll
        for (int j = 0; j < ncols_y; ++j) {
-            tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
+#pragma unroll
            for (int i = 0; i < rows_per_cuda_block; ++i) {
                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
            }
        }
    }
    __syncthreads();
@ -5366,13 +5374,16 @@ static __global__ void mul_mat_vec_q(
 #pragma unroll
    for (int j = 0; j < ncols_y; ++j) {
 #pragma unroll
-        for (int i = 0; i < nwarps-1; ++i) {
+        for (int i = 0; i < rows_per_cuda_block; ++i) {
-            tmp[j] += tmp_shared[i][j][threadIdx.x];
+#pragma unroll
            for (int l = 0; l < nwarps-1; ++l) {
                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
            }
            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
        }
        tmp[j] = warp_reduce_sum(tmp[j]);
-        if (threadIdx.x == 0) {
+        if (threadIdx.x < rows_per_cuda_block) {
-            dst[j*nrows_dst + row] = tmp[j];
+            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
        }
    }
 }
@ -6851,65 +6862,75 @@ static void mul_mat_vec_q_cuda(
    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
    GGML_ASSERT(ncols_x % qk == 0);
-    GGML_ASSERT(ncols_y <= 4);
+    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    int nwarps;
+    int64_t nwarps = 1;
-    if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
+    int64_t rows_per_cuda_block = 1;
        nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
    } else {
        nwarps = MMVQ_NWARPS_NVIDIA;
    }
-    const dim3 block_nums(nrows_x, 1, 1);
+    if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
        switch(ncols_y) {
            case 1:
                nwarps = 4;
                rows_per_cuda_block = 1;
                break;
            case 2:
            case 3:
            case 4:
                nwarps = 4;
                rows_per_cuda_block = 2;
                break;
            case 5:
            case 6:
            case 7:
            case 8:
                nwarps = 2;
                rows_per_cuda_block = 2;
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
    }
    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
    const dim3 block_nums(nblocks, 1, 1);
    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-    switch (nwarps) {
+    switch (ncols_y) {
        case 1: switch(ncols_y) {
        case 1:
-                mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
+            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 2:
-                mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
+            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 3:
-                mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
+            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
        case 4:
-                mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
+            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
-            default:
+        case 5:
-                GGML_ASSERT(false);
+            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
-        } break;
+        case 6:
-        case 4: switch(ncols_y) {
+            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
-            case 1:
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
                mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
            break;
-            case 2:
+        case 7:
-                mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
+            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
-            case 3:
+        case 8:
-                mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
+            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
            break;
            case 4:
                mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
                break;
            default:
                GGML_ASSERT(false);
                break;
        } break;
        default:
            GGML_ASSERT(false);
            break;
@ -9735,7 +9756,7 @@ static __global__ void k_compute_batched_ptrs(
    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
 }
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
@ -9893,39 +9914,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
    int64_t min_compute_capability = INT_MAX;
    bool any_pascal_with_slow_fp16 = false;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        auto & tensor_split = buft_ctx->tensor_split;
        for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+            // skip devices that are not going to do any work:
            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
                continue;
            }
            if (min_compute_capability > g_device_caps[id].cc) {
                min_compute_capability = g_device_caps[id].cc;
            }
            if (g_device_caps[id].cc == 610) {
                any_pascal_with_slow_fp16 = true;
            }
        }
    } else {
        min_compute_capability    = g_device_caps[g_main_device].cc;
        any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
    }
    // check data types and tensor shapes for custom matrix multiplication kernels:
    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
    bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
    bool              use_mul_mat_q =  ggml_cuda_supports_mmq(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-    bool               use_mul_mat_q = ggml_is_quantized(src0->type);
+
 #ifdef CUDA_USE_TENSOR_CORES
    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
 #endif // CUDA_USE_TENSOR_CORES
 #else
-    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
+    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    bool               use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;
 #ifdef CUDA_USE_TENSOR_CORES
    // when tensor cores are available, use them for large batch size
    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
+    use_mul_mat_q     = use_mul_mat_q     && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // CUDA_USE_TENSOR_CORES
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
+    // if mmvq is available it's a better choice than dmmv:
 #ifndef GGML_CUDA_FORCE_DMMV
    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 #endif // GGML_CUDA_FORCE_DMMV
    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@ -9943,34 +9994,16 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
+        ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
+    } else if (use_dequantize_mul_mat_vec) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
 #ifdef GGML_CUDA_FORCE_DMMV
            const bool use_mul_mat_vec_q = false;
 #else
            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
 #endif // GGML_CUDA_FORCE_DMMV
            if (use_mul_mat_vec_q) {
                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
            } else {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-            }
+    } else if (use_mul_mat_vec_q) {
        } else {
            if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
    } else if (use_mul_mat_q) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
    } else {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    }
        }
    } else {
        GGML_ASSERT(false);
    }
 }
 #if 0
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -49,6 +49,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define UNUSED GGML_UNUSED
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@ -3677,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    assert(n % qk == 0);
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    assert((nrc == 2) || (nrc == 1));
 #else
    assert(nrc == 1);
 #endif
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q4_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    if (nrc == 2) {
        const block_q4_0 * restrict vx0 = vx;
        const block_q4_0 * restrict vx1 = vx + bx;
        const block_q8_0 * restrict vy0 = vy;
        const block_q8_0 * restrict vy1 = vy + by;
        float32x4_t sumv0 = vdupq_n_f32(0.0f);
        for (int i = 0; i < nb; i++) {
            const block_q4_0 * restrict b_x0 = &vx0[i];
            const block_q4_0 * restrict b_x1 = &vx1[i];
            const block_q8_0 * restrict b_y0 = &vy0[i];
            const block_q8_0 * restrict b_y1 = &vy1[i];
            const uint8x16_t m4b = vdupq_n_u8(0x0F);
            const int8x16_t  s8b = vdupq_n_s8(0x8);
            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
            // 4-bit -> 8-bit
            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
            // sub 8
            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
            // load y
            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
        }
        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
        vst1_f32(s, vget_low_f32(sumv2));
        vst1_f32(s + bs, vget_high_f32(sumv2));
        return;
    }
 #endif
 #if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -3967,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
 #endif
 }
-void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;
    assert(n % qk == 0);
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    assert((nrc == 2) || (nrc == 1));
 #else
    assert(nrc == 1);
 #endif
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q4_1 * restrict x = vx;
    const block_q8_1 * restrict y = vy;
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    if (nrc == 2) {
        const block_q4_1 * restrict vx0 = vx;
        const block_q4_1 * restrict vx1 = vx + bx;
        const block_q8_1 * restrict vy0 = vy;
        const block_q8_1 * restrict vy1 = vy + by;
        float32x4_t sumv0 = vdupq_n_f32(0.0f);
        float32x4_t summs0 = vdupq_n_f32(0.0f);
        for (int i = 0; i < nb; i++) {
            const block_q4_1 * restrict b_x0 = &vx0[i];
            const block_q4_1 * restrict b_x1 = &vx1[i];
            const block_q8_1 * restrict b_y0 = &vy0[i];
            const block_q8_1 * restrict b_y1 = &vy1[i];
            float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
                                   GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
                                   GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
                                   GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
            summs0 += summs_t;
            const uint8x16_t m4b = vdupq_n_u8(0x0F);
            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
            // 4-bit -> 8-bit
            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
            // load y
            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
            // mmla into int32x4_t
            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
        }
        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
        sumv2 = sumv2 + summs0;
        vst1_f32(s, vget_low_f32(sumv2));
        vst1_f32(s + bs, vget_high_f32(sumv2));
        return;
    }
 #endif
    // TODO: add WASM SIMD
 #if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
@ -4107,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
-void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q5_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;
@ -4393,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
 #endif
 }
-void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;
    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q5_1 * restrict x = vx;
    const block_q8_1 * restrict y = vy;
@ -4692,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
-void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    assert(n % qk == 0);
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    assert((nrc == 2) || (nrc == 1));
 #else
    assert(nrc == 1);
 #endif
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q8_0 * restrict x = vx;
    const block_q8_0 * restrict y = vy;
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    if (nrc == 2) {
        const block_q8_0 * restrict vx0 = vx;
        const block_q8_0 * restrict vx1 = vx + bx;
        const block_q8_0 * restrict vy0 = vy;
        const block_q8_0 * restrict vy1 = vy + by;
        float32x4_t sumv0 = vdupq_n_f32(0.0f);
        for (int i = 0; i < nb; i++) {
            const block_q8_0 * restrict b_x0 = &vx0[i];
            const block_q8_0 * restrict b_y0 = &vy0[i];
            const block_q8_0 * restrict b_x1 = &vx1[i];
            const block_q8_0 * restrict b_y1 = &vy1[i];
            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
            // load y
            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
                             GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
                             GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
                                                                                       l1, r1)), l2, r2)), l3, r3))), scale);
        }
        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
        vst1_f32(s, vget_low_f32(sumv2));
        vst1_f32(s + bs, vget_high_f32(sumv2));
        return;
    }
 #endif
 #if defined(__ARM_NEON)
    float32x4_t sumv0 = vdupq_n_f32(0.0f);
    float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -4795,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
 }
 #if QK_K == 256
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q2_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -5171,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #else
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q2_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -5429,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 #if QK_K == 256
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
@ -5949,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 #else
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q3_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -6292,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 #if QK_K == 256
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q4_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -6648,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 }
 #else
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q4_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -6891,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 #if QK_K == 256
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy,  size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q5_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -7311,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #else
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q5_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -7577,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #if QK_K == 256
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q6_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -8009,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 #else
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q6_K * restrict x = vx;
    const block_q8_K * restrict y = vy;
@ -8339,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
-void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_iq2_xxs * restrict x = vx;
    const block_q8_K    * restrict y = vy;
@ -8462,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
 #endif
 }
-void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_iq2_xs * restrict x = vx;
    const block_q8_K   * restrict y = vy;
@ -8682,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
 }
 // TODO
-void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_iq3_xxs * restrict x = vx;
    const block_q8_K    * restrict y = vy;
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_
 void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@ -27,6 +27,7 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 #define VK_VENDOR_ID_AMD 0x1002
 #define VK_VENDOR_ID_APPLE 0x106b
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
    return ctx->pipeline_matmul_f32_aligned_l.align;
 }
-static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
    if (bit16_x && bit16_y) {
-        if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
        }
-        if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+#ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
    }
    if (bit16_x && !bit16_y) {
        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
        }
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
    }
    if (!bit16_x && bit16_y) {
        GGML_ASSERT(false);
    }
    if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
    }
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
 }
 static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
    if (bit16_x && bit16_y) {
        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
    }
    if (bit16_x && !bit16_y) {
        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
    }
    if (!bit16_x && bit16_y) {
        GGML_ASSERT(false);
    }
    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
 }
 static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
    if (bit16_x && bit16_y) {
        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
    }
    if (bit16_x && !bit16_y) {
        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
    }
    if (!bit16_x && bit16_y) {
        GGML_ASSERT(false);
    }
    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
 }
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
    switch (ctx->device.lock()->vendor_id) {
    case VK_VENDOR_ID_AMD:
        return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
    case VK_VENDOR_ID_APPLE:
        return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
    case VK_VENDOR_ID_INTEL:
        return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
    }
    if (bit16_x && bit16_y) {
        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
        }
        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
        return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
    }
    if (bit16_x && !bit16_y) {
-        if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
        }
-        if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
        GGML_ASSERT(false);
    }
-    if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+    if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " S" << std::endl;
 #endif
        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
    }
-    if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+    if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
    std::cerr << " M" << std::endl;
 #endif
--- a/ggml.c
+++ b/ggml.c
@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    [GGML_TYPE_I8] = {
@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .is_quantized             = false,
        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
        .vec_dot_type             = GGML_TYPE_F32,
        .nrows                    = 1,
    },
    [GGML_TYPE_F16] = {
        .type_name                = "f16",
@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
        .vec_dot_type             = GGML_TYPE_F16,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q4_0] = {
        .type_name                = "q4_0",
@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows                    = 2,
 #else
        .nrows                    = 1,
 #endif
    },
    [GGML_TYPE_Q4_1] = {
        .type_name                = "q4_1",
@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type             = GGML_TYPE_Q8_1,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows                    = 2,
 #else
        .nrows                    = 1,
 #endif
    },
    [4] = { // GGML_TYPE_Q4_2
        .type_name                = "DEPRECATED",
@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = NULL,
        .vec_dot                  = NULL,
        .vec_dot_type             = GGML_TYPE_COUNT,
        .nrows                    = 1,
    },
    [5] = { // GGML_TYPE_Q4_3
        .type_name                = "DEPRECATED",
@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = NULL,
        .vec_dot                  = NULL,
        .vec_dot_type             = GGML_TYPE_COUNT,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q5_0] = {
        .type_name                = "q5_0",
@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q5_1] = {
        .type_name                = "q5_1",
@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
        .vec_dot_type             = GGML_TYPE_Q8_1,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q8_0] = {
        .type_name                = "q8_0",
@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
        .vec_dot_type             = GGML_TYPE_Q8_0,
 #if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows                    = 2,
 #else
        .nrows                    = 1,
 #endif
    },
    [GGML_TYPE_Q8_1] = {
        .type_name                = "q8_1",
@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float               = quantize_row_q8_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
        .vec_dot_type             = GGML_TYPE_Q8_1,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q2_K] = {
        .type_name                = "q2_K",
@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q3_K] = {
        .type_name                = "q3_K",
@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q4_K] = {
        .type_name                = "q4_K",
@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q5_K] = {
        .type_name                = "q5_K",
@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q6_K] = {
        .type_name                = "q6_K",
@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_IQ2_XXS] = {
        .type_name                = "iq2_xxs",
@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = NULL,
        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_IQ2_XS] = {
        .type_name                = "iq2_xs",
@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = NULL,
        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_IQ3_XXS] = {
        .type_name                = "iq3_xxs",
@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .from_float_reference     = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
        .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q8_K] = {
        .type_name                = "q8_K",
@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
+static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
   assert(nrc == 1);
   UNUSED(nrc);
   UNUSED(bx);
   UNUSED(by);
   UNUSED(bs);
 #ifdef GGML_SIMD
    float sumf = 0.0f;
    const int np = (n & ~(GGML_F32_STEP - 1));
@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
    *s = sumf;
 }
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
+static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    ggml_float sumf = 0.0;
 #if defined(GGML_SIMD)
@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
 #endif
 }
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s);   }
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
 inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
 inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);   }
@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
    const int64_t blck_0 = 16;
    const int64_t blck_1 = 16;
    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
    int64_t nrc = vec_dot_num_rows;
    // TODO: currently the mmla kernels support only even numbered rows/cols.
    // this check can be removed once they are extended to support odd numbered rows/cols too
    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
        nrc = 1;
    }
    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
    // attempt to reduce false-sharing (does not seem to make a difference)
-    float tmp[16];
+    // 16 * 2, accounting for mmla kernels
    float tmp[32];
    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
                const int64_t i13 = (ir1/(ne12*ne1));
                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
                    (src1_cont || src1->type != vec_dot_type
                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
                     : (i11*nb11 + i12*nb12 + i13*nb13));
                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                //}
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
                }
                for (int cn = 0; cn < nrc; ++cn) {
                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                }
                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
            }
        }
    }
@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
                    //}
                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
                    }
                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                }
@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
        // linear runtime, no additional memory
        float dot_y_dy = 0;
-        ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
+        ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
        ggml_vec_cpy_f32 (nc, dx, dy);
        ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
        ggml_vec_mul_f32 (nc, dx, dx, y);
@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
            const int i1n = i10*ne11;
            for (int i00 = 0; i00 < ne00; i00++) {
                float v = 0;
-                ggml_vec_dot_f16(ne02, &v,
+                ggml_vec_dot_f16(ne02, &v, 0,
-                        (ggml_fp16_t *)    wdata_src + i1n,
+                        (ggml_fp16_t *)    wdata_src + i1n, 0,
-                        (ggml_fp16_t *) wdata_kernel + i00*ne02);
+                        (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
                dst_data[i10*s0 + i00] += v;
            }
        }
@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
            const int i1n = i10*ne11;
            for (int i00 = 0; i00 < ne00; i00++) {
                float v = 0;
-                ggml_vec_dot_f32(ne02, &v,
+                ggml_vec_dot_f32(ne02, &v, 0,
-                        wdata_src + i1n,
+                        wdata_src + i1n, 0,
-                        wdata_kernel + i00*ne02);
+                        wdata_kernel + i00*ne02, 0, 1);
                dst_data[i10*s0 + i00] += v;
            }
        }
@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
                for (int i01 = 0; i01 < ne01; i01++) {
                    for (int i00 = 0; i00 < ne00; i00++) {
                        float v = 0;
-                        ggml_vec_dot_f16(ne03, &v,
+                        ggml_vec_dot_f16(ne03, &v, 0,
-                                wdata_src + i1n,
+                                wdata_src + i1n, 0,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03);
+                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
                    }
                }
@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
            const int i1 = ik1;
            ggml_vec_dot_f32(neq0,
-                    S + i1,
+                    S + i1, 0,
-                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
        }
        // scale
@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
            const int iv3 = iq3;
            ggml_vec_dot_f32(masked_begin,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
+                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)), 0,
-                    (float *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
+                    (float *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                    S);
+                    S, 0, 1);
        }
    }
 }
@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
                const int i1 = ik1;
                ggml_vec_dot_f16(neq0,
-                        S + i1,
+                        S + i1, 0,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
            }
        } else {
            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
                const int iv3 = iq3;
                ggml_vec_dot_f16(nev0,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
+                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
+                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
-                        S16);
+                        S16, 0, 1);
            }
        } else {
            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
            const int i1 = ib01;
            ggml_vec_dot_f16(nea0,
-                    S + i1,
+                    S + i1, 0,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
+                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
-                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)));
+                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)), 0, 1);
        }
        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
            for (int64_t ic = 0; ic < nec01; ++ic) {
                ggml_vec_dot_f16(neb01,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)),
+                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)), 0,
-                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)),
+                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
-                        S16);
+                        S16, 0, 1);
            }
            ggml_vec_add_f32(nec01,
@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
                    const int i1 = ik1;
                    ggml_vec_dot_f32(neq0,
-                            S + i1,
+                            S + i1, 0,
-                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
-                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
                }
                // scale
@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                // S = SM * (S - dot(SM, S))
                float dot_SM_gradSM = 0;
-                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
+                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
                ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
                ggml_vec_mul_f32 (masked_begin, S, S, SM);
@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
    }
    // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, g, d);
+    ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
    // make sure that d points to a descent direction
    if (0 < dginit) {
@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
                return count;
            }
-            ggml_vec_dot_f32(nx, &dg, g, d);
+            ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
            // check the Wolfe condition
            if (dg < params->lbfgs.wolfe * dginit) {
@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        //     ys = y^t \cdot s    -> 1 / \rho.
        //     yy = y^t \cdot y.
        //
-        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
+        ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
-        ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
+        ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
        lm_ys[end[0]] = ys;
@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        for (int i = 0; i < bound; ++i) {
            j[0] = (j[0] + m - 1) % m;
            // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
+            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
            lm_alpha[j[0]] /= lm_ys[j[0]];
            // q_{i} = q_{i+1} - \alpha_{i} y_{i}
            ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        for (int i = 0; i < bound; ++i) {
            // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
+            ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
            beta /= lm_ys[j[0]];
            // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
            ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
@ -20621,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
 #endif
 }
 int ggml_cpu_has_matmul_int8(void) {
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    return 1;
 #else
    return 0;
 #endif
 }
 ////////////////////////////////////////////////////////////////////////////////
--- a/ggml.h
+++ b/ggml.h
@ -2290,6 +2290,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_ssse3      (void);
    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_vsx        (void);
    GGML_API int ggml_cpu_has_matmul_int8(void);
    //
    // Internal types and functions exposed for tests and benchmarks
@ -2303,7 +2304,8 @@ extern "C" {
 #endif
    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);
    typedef struct {
        const char      * type_name;
@ -2315,6 +2317,7 @@ extern "C" {
        ggml_from_float_t from_float_reference;
        ggml_vec_dot_t    vec_dot;
        enum ggml_type    vec_dot_type;
        int64_t           nrows; // number of rows to process simultaneously;
    } ggml_type_traits_t;
    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -50,6 +50,7 @@ class Keys:
        VALUE_LENGTH      = "{arch}.attention.value_length"
        LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
        LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
        CAUSAL            = "{arch}.attention.causal"
    class Rope:
        DIMENSION_COUNT      = "{arch}.rope.dimension_count"
@ -63,6 +64,7 @@ class Keys:
        MODEL            = "tokenizer.ggml.model"
        LIST             = "tokenizer.ggml.tokens"
        TOKEN_TYPE       = "tokenizer.ggml.token_type"
        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
        SCORES           = "tokenizer.ggml.scores"
        MERGES           = "tokenizer.ggml.merges"
        BOS_ID           = "tokenizer.ggml.bos_token_id"
@ -122,6 +124,7 @@ class MODEL_TENSOR(IntEnum):
    ATTN_OUT        = auto()
    ATTN_NORM       = auto()
    ATTN_NORM_2     = auto()
    ATTN_OUT_NORM   = auto()
    ATTN_ROT_EMBD   = auto()
    FFN_GATE_INP    = auto()
    FFN_NORM        = auto()
@ -134,6 +137,7 @@ class MODEL_TENSOR(IntEnum):
    FFN_UP_EXP      = auto()
    ATTN_Q_NORM     = auto()
    ATTN_K_NORM     = auto()
    LAYER_OUT_NORM  = auto()
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -178,6 +182,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.ATTN_OUT_NORM:   "blk.{bid}.attn_output_norm",
    MODEL_TENSOR.FFN_GATE_INP:    "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
@ -187,6 +192,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate.{xid}",
    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down.{xid}",
    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up.{xid}",
    MODEL_TENSOR.LAYER_OUT_NORM:  "blk.{bid}.layer_output_norm",
 }
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -262,17 +268,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    ],
    MODEL_ARCH.BERT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_EMBD_NORM,
        MODEL_TENSOR.TOKEN_TYPES,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.LAYER_OUT_NORM,
    ],
    MODEL_ARCH.MPT: [
        MODEL_TENSOR.TOKEN_EMBD,
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -357,6 +357,9 @@ class GGUFWriter:
    def add_layer_norm_rms_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
    def add_causal_attention(self, value: bool) -> None:
        self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
    def add_rope_dimension_count(self, count: int) -> None:
        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
@ -387,6 +390,9 @@ class GGUFWriter:
    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
        self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
    def add_token_type_count(self, value: int) -> None:
        self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)
    def add_token_scores(self, scores: Sequence[float]) -> None:
        self.add_array(Keys.Tokenizer.SCORES, scores)
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -30,6 +30,7 @@ class TensorNameMap:
        # Normalization of token embeddings
        MODEL_TENSOR.TOKEN_EMBD_NORM: (
            "word_embeddings_layernorm",  # bloom
            "embeddings.LayerNorm",       # bert
        ),
        # Position embeddings
@ -54,7 +55,6 @@ class TensorNameMap:
            "transformer.ln_f",                        # gpt2 gpt-j falcon
            "model.norm",                              # llama-hf baichuan internlm2
            "norm",                                    # llama-pth
            "embeddings.LayerNorm",                    # bert
            "transformer.norm_f",                      # mpt
            "ln_f",                                    # refact bloom qwen gpt2
            "language_model.encoder.final_layernorm",  # persimmon
@ -79,7 +79,6 @@ class TensorNameMap:
            "transformer.h.{bid}.ln_mlp",                           # falcon40b
            "model.layers.{bid}.input_layernorm",                   # llama-hf
            "layers.{bid}.attention_norm",                          # llama-pth
            "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",                               # yi
            "h.{bid}.ln_1",                                         # gpt2
@ -155,6 +154,11 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wo",                           # internlm2
        ),
        # Attention output norm
        MODEL_TENSOR.ATTN_OUT_NORM: (
            "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
        ),
        # Rotary embeddings
        MODEL_TENSOR.ATTN_ROT_EMBD: (
            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",        # llama-hf
@ -171,7 +175,6 @@ class TensorNameMap:
            "transformer.blocks.{bid}.norm_2",                               # mpt
            "model.layers.{bid}.post_attention_layernorm",                   # llama-hf
            "layers.{bid}.ffn_norm",                                         # llama-pth
            "encoder.layer.{bid}.output.LayerNorm",                          # bert
            "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
            "model.layers.{bid}.ln2",                                        # yi
            "h.{bid}.ln_2",                                                  # gpt2
@ -266,6 +269,10 @@ class TensorNameMap:
        MODEL_TENSOR.ROPE_FREQS: (
            "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq",  # persimmon
        ),
        MODEL_TENSOR.LAYER_OUT_NORM: (
            "encoder.layer.{bid}.output.LayerNorm",  # bert
        )
    }
    mapping: dict[str, tuple[MODEL_TENSOR, str]]
--- a/llama.cpp
+++ b/llama.cpp
@ -196,6 +196,7 @@ enum llm_arch {
    LLM_ARCH_STARCODER,
    LLM_ARCH_PERSIMMON,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_STARCODER,       "starcoder" },
    { LLM_ARCH_PERSIMMON,       "persimmon" },
    { LLM_ARCH_REFACT,          "refact"    },
    { LLM_ARCH_BERT,            "bert"      },
    { LLM_ARCH_BLOOM,           "bloom"     },
    { LLM_ARCH_STABLELM,        "stablelm"  },
    { LLM_ARCH_QWEN,            "qwen"      },
@ -261,6 +263,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_VALUE_LENGTH,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ATTENTION_CAUSAL,
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
@ -273,6 +276,7 @@ enum llm_kv {
    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
@ -316,6 +320,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_VALUE_LENGTH,        "%s.attention.value_length"           },
    { LLM_KV_ATTENTION_LAYERNORM_EPS,       "%s.attention.layer_norm_epsilon"     },
    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },
    { LLM_KV_ATTENTION_CAUSAL,              "%s.attention.causal"                 },
    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
@ -328,6 +333,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"              },
    { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens"             },
    { LLM_KV_TOKENIZER_TOKEN_TYPE,          "tokenizer.ggml.token_type"         },
    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,    "tokenizer.ggml.token_type_count"   },
    { LLM_KV_TOKENIZER_SCORES,              "tokenizer.ggml.scores"             },
    { LLM_KV_TOKENIZER_MERGES,              "tokenizer.ggml.merges"             },
    { LLM_KV_TOKENIZER_BOS_ID,              "tokenizer.ggml.bos_token_id"       },
@ -355,6 +361,7 @@ struct LLM_KV {
 enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
@ -536,6 +543,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_BERT,
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
            { LLM_TENSOR_POS_EMBD,        "position_embd" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_output_norm" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_NORM,        "blk.%d.layer_output_norm" },
            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_BLOOM,
        {
@ -1440,6 +1464,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
    MODEL_UNKNOWN,
    MODEL_17M,
    MODEL_22M,
    MODEL_33M,
    MODEL_109M,
    MODEL_335M,
    MODEL_0_5B,
    MODEL_1B,
    MODEL_2B,
@ -1481,6 +1510,7 @@ struct llama_hparams {
    uint32_t n_ff;
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_vocab_type = 0; // for BERT-style token types
    float f_norm_eps;
    float f_norm_rms_eps;
@ -1493,6 +1523,8 @@ struct llama_hparams {
    float f_clamp_kqv;
    float f_max_alibi_bias;
    bool causal_attn = true;
    bool operator!=(const llama_hparams & other) const {
        if (this->vocab_only    != other.vocab_only)    return true;
@ -1720,6 +1752,7 @@ struct llama_model {
    llama_vocab   vocab;
    struct ggml_tensor * tok_embd;
    struct ggml_tensor * type_embd;
    struct ggml_tensor * pos_embd;
    struct ggml_tensor * tok_norm;
    struct ggml_tensor * tok_norm_b;
@ -1848,6 +1881,7 @@ struct llama_context {
    struct ggml_tensor * inp_pos;       // I32 [n_batch]
    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
    struct ggml_tensor * inp_sum;       // F32 [1, n_batch]
 #ifdef GGML_USE_MPI
    ggml_mpi_context * ctx_mpi = NULL;
@ -2827,6 +2861,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
    switch (type) {
        case LLAMA_VOCAB_TYPE_SPM:         return "SPM";
        case LLAMA_VOCAB_TYPE_BPE:         return "BPE";
        case LLAMA_VOCAB_TYPE_WPM:         return "WPM";
        default:                           return "unknown";
    }
 }
@ -2998,6 +3033,26 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                switch (hparams.n_layer) {
                    case 3:
                        model.type = e_model::MODEL_17M; break; // bge-micro
                    case 6:
                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
                    case 12:
                        switch (hparams.n_embd) {
                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
                        } break;
                    case 24:
                        model.type = e_model::MODEL_335M; break; // bge-large
                }
            } break;
        case LLM_ARCH_BLOOM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -3202,6 +3257,16 @@ static void llm_load_vocab(
            vocab.special_unk_id = -1;
            vocab.special_sep_id = -1;
            vocab.special_pad_id = -1;
        } else if (tokenizer_name == "bert") {
            vocab.type = LLAMA_VOCAB_TYPE_WPM;
            // default special tokens
            vocab.special_bos_id = 101;
            vocab.special_eos_id = 102;
            vocab.special_unk_id = 100;
            vocab.special_sep_id = -1;
            vocab.special_pad_id = -1;
            vocab.add_space_prefix = false;
        } else {
            LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@ -3230,6 +3295,8 @@ static void llm_load_vocab(
    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@ -3567,6 +3634,7 @@ static bool llm_load_tensors(
        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
        const int64_t n_embd_gqa   = n_embd_v_gqa;
        const int64_t n_vocab      = hparams.n_vocab;
        const int64_t n_vocab_type = hparams.n_vocab_type;
        const int64_t n_ff         = hparams.n_ff;
        GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
@ -3781,11 +3849,50 @@ static bool llm_load_tensors(
                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {64});
                    }
                } break;
            case LLM_ARCH_BERT:
                {
                    model.tok_embd   = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab});
                    model.type_embd  = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES,     "weight"), {n_embd, n_vocab_type});
                    model.pos_embd   = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD,        "weight"), {n_embd, hparams.n_ctx_train});
                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);
                        auto & layer = model.layers[i];
                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa});
                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff});
                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
                    }
                } break;
            case LLM_ARCH_BLOOM:
                {
                    model.tok_embd   = ml.create_tensor(ctx_input,  tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab});
-                    model.tok_norm   = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                    model.tok_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
+                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd});
                    // output
                    {
@ -4737,6 +4844,7 @@ struct llm_build_context {
    const int32_t n_orig_ctx;
    const bool do_rope_shift;
    const bool causal_attn;
    const llm_build_cb & cb;
@ -4780,6 +4888,7 @@ struct llm_build_context {
        kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
        n_orig_ctx       (cparams.n_yarn_orig_ctx),
        do_rope_shift    (worst_case || kv_self.has_shift),
        causal_attn      (hparams.causal_attn),
        cb               (cb),
        buf_compute_meta (lctx.buf_compute_meta) {
            // all initializations should be done in init()
@ -5623,6 +5732,100 @@ struct llm_build_context {
        return gf;
    }
    struct ggml_cgraph * build_bert() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
        const int64_t n_embd_head = hparams.n_embd_head_v;
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
        GGML_ASSERT(n_embd_head == hparams.n_rot);
        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
        // get input vectors with right size
        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
        struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
        // token types are hardcoded to zero ("Sentence A")
        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
        inpL = ggml_add(ctx0, inpL, type_row0);
        inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
        cb(inpL, "inp_embd", -1);
        // embed layer norm
        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
        cb(inpL, "inp_norm", -1);
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
        // iterate layers
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * cur = inpL;
            // self-attention
            {
                struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                cb(Qcur, "Qcur", il);
                struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
                cb(Kcur, "Kcur", il);
                struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
                cb(Vcur, "Vcur", il);
                // seems like we just need to do this for Q?
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
            // re-add the layer input
            cur = ggml_add(ctx0, cur, inpL);
            // attention layer norm
            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
            struct ggml_tensor * ffn_inp = cur;
            cb(ffn_inp, "ffn_inp", il);
            // feed-forward network
            cur = llm_build_ffn(ctx0, cur,
                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                    NULL,                      NULL,
                    model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
            cb(cur, "ffn_out", il);
            // attentions bypass the intermediate layer
            cur = ggml_add(ctx0, cur, ffn_inp);
            // output layer norm
            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
            // input for next layer
            inpL = cur;
        }
        // final output
        cur = inpL;
        // pooling
        cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
        cb(cur, "result_embed", -1);
        ggml_build_forward_expand(gf, cur);
        return gf;
    }
    struct ggml_cgraph * build_bloom() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@ -7045,6 +7248,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_refact();
            } break;
        case LLM_ARCH_BERT:
            {
                result = llm.build_bert();
            } break;
        case LLM_ARCH_BLOOM:
            {
                result = llm.build_bloom();
@ -7156,6 +7363,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }
    {
        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
        float * data = (float *) lctx.inp_sum->data;
        for (int i = 0; i < batch.n_tokens; ++i) {
            data[i] = 1.0f/float(batch.n_tokens);
        }
    }
    if (kv_self.has_shift) {
        const int64_t n_ctx = cparams.n_ctx;
@ -7271,14 +7488,19 @@ static int llama_decode_internal(
    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
    // the embeddings could be the second to last tensor, or the third to last tensor
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
    if (strcmp(res->name, "result_output") == 0) {
        // the embeddings could be the second to last tensor, or the third to last tensor
        if (strcmp(embeddings->name, "result_norm") != 0) {
            embeddings = gf->nodes[gf->n_nodes - 3];
            GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
        }
    } else if (strcmp(res->name, "result_embed") == 0) {
        embeddings = res;
        res = nullptr;
    } else {
        GGML_ASSERT(false);
    }
    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@ -7349,7 +7571,7 @@ static int llama_decode_internal(
    // extract logits
    // TODO: do not compute and extract logits if only embeddings are needed
    //       need to update the graphs to skip "result_output"
-    {
+    if (res) {
        auto & logits_out = lctx.logits;
 #ifndef NDEBUG
@ -7393,9 +7615,11 @@ static int llama_decode_internal(
    if (!lctx.embedding.empty()) {
        auto & embedding_out = lctx.embedding;
        const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
        embedding_out.resize(n_embd);
        ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
-        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
        ggml_backend_synchronize(embeddings_backend);
    }
@ -7459,6 +7683,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
        GGML_ASSERT(false);
        return unicode_to_bytes_bpe(token_data.text);
    }
    case LLAMA_VOCAB_TYPE_WPM: {
        GGML_ASSERT(false);
    }
    default:
        GGML_ASSERT(false);
    }
@ -7471,6 +7698,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            return vocab.token_to_id.at(buf);
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
            return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
        }
@ -7941,12 +8169,212 @@ private:
    llm_bigram_bpe::queue work_queue;
 };
-typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+struct llm_tokenizer_wpm {
    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        auto * token_map = &vocab.token_to_id;
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already
        // find the longest tokens that form the words
        for (const std::string &word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }
            // prepend phantom space
            std::string word1 = "\xe2\x96\x81" + word;
            int n = word1.size();
            // we're at the start of a new word
            int i = 0;
            bool match_any = false;
            // move through character position in word
            while (i < n) {
                // loop through possible match length
                bool match = false;
                for (int j = n; j > i; j--) {
                    auto it = token_map->find(word1.substr(i, j - i));
                    if (it != token_map->end()) {
                        output.push_back(it->second);
                        match = true;
                        match_any = true;
                        i = j;
                        break;
                    }
                }
                // must be an unknown character
                if (!match) {
                    i++;
                }
            }
            // we didn't find any matches for this word
            if (!match_any) {
                output.push_back(vocab.special_unk_id);
            }
        }
        // append eos token
        output.push_back(vocab.special_eos_id);
    }
    std::vector<std::string> preprocess(const std::string & text) {
        std::string ori_str = normalize(text);
        uint64_t ori_size = ori_str.size();
        // single punct / single symbol / single digit
        // baseline: add whitespace on the left and right of punct and chinese characters
        std::vector<std::string> words;
        std::string new_str = "";
        uint64_t i = 0;
        while (i < ori_size) {
            int utf_char_len = utf8_len(ori_str[i]);
            if ((utf_char_len == 1) && ispunct(ori_str[i])) {
                new_str += " ";
                new_str += ori_str[i];
                new_str += " ";
                i += 1;
            }
            else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
                new_str += " ";
                new_str += ori_str.substr(i, 3);
                new_str += " ";
                i += 3;
            }
            else {
                new_str += ori_str[i];
                i += 1;
            }
        }
        // split by whitespace
        uint64_t l = 0;
        uint64_t r = 0;
        while (r < new_str.size()) {
            // if is whitespace
            if (isspace(new_str[r])) {
                if (r > l) words.push_back(new_str.substr(l, (r - l)));
                l = r + 1;
                r = l;
            }
            else {
                r += 1;
            }
        }
        if (r > l) {
            words.push_back(new_str.substr(l, (r - l)));
        }
        return words;
    }
    std::string normalize(const std::string & text) {
        // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
        std::string text2 = strip_accents(text);
        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
            char c = text2[i];
            if (c >= 'A' && c <= 'Z') {
                text2[i] = c - 'A' + 'a';
            }
        }
        return text2;
    }
    bool is_chinese_char(const std::string & str) {
        int len = str.length();
        unsigned int codepoint = 0;
        int num_bytes = 0;
        int i = 0;
        unsigned char ch = static_cast<unsigned char>(str[i]);
        if (ch <= 0x7f) {
            codepoint = ch;
            num_bytes = 1;
        } else if ((ch >> 5) == 0x06) {
            codepoint = ch & 0x1f;
            num_bytes = 2;
        } else if ((ch >> 4) == 0x0e) {
            codepoint = ch & 0x0f;
            num_bytes = 3;
        } else if ((ch >> 3) == 0x1e) {
            codepoint = ch & 0x07;
            num_bytes = 4;
        }
        for (int j = 1; j < num_bytes; ++j) {
            if (i + j >= len) {
                return false; // incomplete UTF-8 character
            }
            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
            if ((next_ch >> 6) != 0x02) {
                return false; // invalid trailing byte
            }
            codepoint = (codepoint << 6) | (next_ch & 0x3f);
        }
        if ((codepoint >= 0x4E00  && codepoint <= 0x9FFF)  ||
            (codepoint >= 0x3400  && codepoint <= 0x4DBF)  ||
            (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
            (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
            (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
            (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (codepoint >= 0xF900  && codepoint <= 0xFAFF)  ||
            (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
            (codepoint >= 0x3000  && codepoint <= 0x303F)  ||
            (codepoint >= 0xFF00  && codepoint <= 0xFFEF)) {
            return true; // NOLINT
        }
        return false;
    }
    std::string strip_accents(const std::string & input_string) {
        std::string resultString;
        std::map<std::string, char> accent_map = {
            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
        };
        for (size_t i = 0; i <  input_string.length();) {
            int len = utf8_len(input_string[i]);
            std::string curChar = input_string.substr(i, len);
            auto iter = accent_map.find(curChar);
            if (iter != accent_map.end()) {
                resultString += iter->second;
            } else {
                resultString += curChar;
            }
            i += len;
        }
        return resultString;
    }
    static size_t utf8_len(char src) {
        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
        return lookup[highbits];
    }
    const llama_vocab & vocab;
 };
 typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
 } FRAGMENT_BUFFER_VARIANT_TYPE;
-struct fragment_buffer_variant{
+struct fragment_buffer_variant {
    fragment_buffer_variant(llama_vocab::id _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
@ -7976,8 +8404,7 @@ struct fragment_buffer_variant{
 // #define PRETOKENIZERDEBUG
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
 {
    // for each special token
    for (const auto & st: vocab.special_tokens_cache) {
        const auto & special_token = st.first;
@ -8095,10 +8522,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
    switch (vocab.type) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
-                for (const auto & fragment: fragment_buffer)
+                for (const auto & fragment: fragment_buffer) {
-                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
                    {
                        // without adding this leading whitespace, we do not get the same results as the original tokenizer
                        // TODO: It's likely possible to get rid of this string copy entirely
@ -8118,19 +8543,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                        llm_tokenizer_spm tokenizer(vocab);
                        llama_escape_whitespace(raw_text);
                        tokenizer.tokenize(raw_text, output);
-                    }
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                    {
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                for (const auto & fragment: fragment_buffer)
+                for (const auto & fragment: fragment_buffer) {
-                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
                    {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
@ -8138,9 +8559,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #endif
                        llm_tokenizer_bpe tokenizer(vocab);
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
-                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                }
            } break;
        case LLAMA_VOCAB_TYPE_WPM:
            {
                for (const auto & fragment: fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
 #ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                        llm_tokenizer_wpm tokenizer(vocab);
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
@ -10804,7 +11239,7 @@ struct llama_context * llama_new_context_with_model(
        // graph inputs
        {
            ggml_init_params init_params = {
-                /* .mem_size   */ ggml_tensor_overhead()*5,
+                /* .mem_size   */ ggml_tensor_overhead()*7,
                /* .mem_buffer */ nullptr,
                /* .no_alloc   */ true,
            };
@ -10815,12 +11250,14 @@ struct llama_context * llama_new_context_with_model(
            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
            ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
            ggml_set_name(ctx->inp_tokens,  "inp_tokens");
            ggml_set_name(ctx->inp_embd,    "inp_embd");
            ggml_set_name(ctx->inp_pos,     "inp_pos");
            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
            ggml_set_name(ctx->inp_sum,     "inp_sum");
            ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
@ -11755,6 +12192,7 @@ static std::string llama_decode_text(const std::string & text) {
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_SPM: {
            // NOTE: we accept all unsupported token types,
            // suppressing them like CONTROL tokens.
@ -11878,6 +12316,7 @@ const char * llama_print_system_info(void) {
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
    return s.c_str();
 }
--- a/llama.h
+++ b/llama.h
@ -61,6 +61,7 @@ extern "C" {
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
    };
    enum llama_token_type {
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@ -156,8 +156,8 @@ int main(int argc, char** argv) {
        t1 = std::chrono::high_resolution_clock::now();
        float fs;
-        if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data());
+        if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
-        else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data());
+        else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
        t2 = std::chrono::high_resolution_clock::now();
        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
        if (iloop > 3) ggml.addResult(fs, t);
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@ -284,8 +284,8 @@ int main(int argc, char** argv) {
        else {
            auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
            vdot.from_float(y1.data(), q8.data(), kVecSize);
-            if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data());
+            if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-            else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data());
+            else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
        }
        sumq += result;
        t2 = std::chrono::high_resolution_clock::now();
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@ -87,7 +87,7 @@ static float dot_product_error(
    vdot.from_float(test_data2, tmp_q2.data(), test_size);
    float result = INFINITY;
-    qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
+    qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
    const float dot_ref = dot_product(test_data1, test_data2, test_size);
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -346,7 +346,7 @@ int main(int argc, char * argv[]) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void) -> float {
                        float result;
-                        qfns.vec_dot(size, &result, test_q1, test_q2);
+                        qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                        return result;
                    };
                    size_t quantized_size = ggml_row_size(type, size);