Merge branch 'master' of https://github.com/ggerganov/llama.cpp into hk

commit 8ea2402195
37 changed files with 1664 additions and 1401 deletions

.github/workflows/build.yml (vendored, 2 changes)
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))

       - name: Determine tag name
         id: tag
@@ -3,7 +3,7 @@


 [](https://opensource.org/licenses/MIT)
 [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 [](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -48,7 +48,7 @@ class Model:

     dir_model: Path
     ftype: gguf.LlamaFileType
-    fname_out: Path | None
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool

@@ -62,11 +62,12 @@ class Model:
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
+    dir_model_card: Path

     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):

@@ -90,6 +91,7 @@ class Model:
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -237,6 +239,10 @@ class Model:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")

+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")

@@ -345,7 +351,7 @@ class Model:

         total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()

-        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
@@ -359,27 +365,22 @@ class Model:
         output_type: str = self.ftype.name.partition("_")[2]

         # Filename Output
-        # Note: `not is_dir()` is used because `.is_file()` will not detect
-        #       file template strings as it doesn't actually exist as a file
-        if self.fname_out is not None and not self.fname_out.is_dir():
-            # Output path is a custom defined templated filename
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-        else:
+        if self.fname_out.is_dir():
             # Generate default filename based on model specification and available metadata
             if not vocab_only:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
             else:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")

-            # Check if preferred output directory path was provided
-            if self.fname_out is not None and self.fname_out.is_dir():
-                # output path is a directory
-                self.fname_out = self.fname_out / f"{fname_default}.gguf"
-            else:
-                # output in the same directory as the model by default
-                self.fname_out = self.dir_model / f"{fname_default}.gguf"
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            #       file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

         self.set_type()
@@ -593,6 +594,15 @@ class Model:
         if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"

         if res is None:
             logger.warning("\n")
@@ -733,7 +743,7 @@ class Model:
             added_tokens_json = json.load(f)
             for key in added_tokens_json:
                 token_id = added_tokens_json[key]
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue

@@ -750,7 +760,8 @@ class Model:
                 token_id = int(token_id)
                 token: str = token_data["content"]
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token.encode("utf-8")
+                    if tokens[token_id] != token.encode("utf-8"):
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                 if token_data.get("special") or self.does_token_look_special(token):
                     toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                 else:
@@ -1309,6 +1320,7 @@ class RefactModel(Model):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):

@@ -1479,7 +1491,12 @@ class LlamaModel(Model):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)

         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1994,7 +2011,7 @@ class Phi3MiniModel(Model):

             for key in added_tokens_json:
                 token_id = added_tokens_json[key]
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue

@@ -2011,7 +2028,8 @@ class Phi3MiniModel(Model):
                 token_id = int(token_id)
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

@@ -2027,7 +2045,8 @@ class Phi3MiniModel(Model):
                 token_id = int(foken_data["id"])
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

@@ -2068,7 +2087,7 @@ class Phi3MiniModel(Model):

         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
+        if rope_scaling is None:
             return

         scale = max_pos_embds / orig_max_pos_embds
@@ -2266,7 +2285,8 @@ class InternLM2Model(Model):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

@@ -2285,7 +2305,8 @@ class InternLM2Model(Model):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2471,6 +2492,7 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

         self.gguf_writer.add_add_space_prefix(False)

@@ -2712,7 +2734,7 @@ class JinaBertV2Model(BertModel):

                 yield name, data

-    def set_vocab(self, *args, **kwargs):
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']

@@ -2860,7 +2882,7 @@ class ArcticModel(Model):
             added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
             for token_id, token_json in added_tokens_decoder.items():
                 token_id = int(token_id)
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue
@@ -3109,7 +3131,7 @@ class T5Model(Model):
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue

@@ -3420,7 +3442,6 @@ class ChatGLMModel(Model):
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         n_head_kv = self.hparams.get("multi_query_group_num", n_head)
@@ -3625,10 +3646,10 @@ def main() -> None:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)

-    fname_out = None
-
     if args.outfile is not None:
         fname_out = args.outfile
+    else:
+        fname_out = dir_model

     logger.info(f"Loading model: {dir_model.name}")

@@ -3659,7 +3680,6 @@ def main() -> None:
     else:
         logger.info("Exporting model...")
         model_instance.write()
-        assert model_instance.fname_out is not None
         out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
         logger.info(f"Model successfully exported to {out_path}")
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):

 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

 if len(sys.argv) == 2:
     token = sys.argv[1]

@@ -91,6 +91,9 @@ models = [
     {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
     {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]


@@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")

@@ -159,7 +162,7 @@ for model in models:
         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
         continue  # Skip to the next model if the tokenizer can't be loaded

-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
     chkhsh = sha256(str(chktok).encode()).hexdigest()

     logger.info(f"model: {name}")

@@ -191,7 +194,7 @@ src_func = f"""
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

-       chktxt = {repr(chktxt)}
+       chktxt = {repr(CHK_TXT)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

@@ -287,7 +290,7 @@ tests = [
     "333333333",
     "Cửa Việt", # llama-bpe fails on this
     " discards",
-    chktxt,
+    CHK_TXT,
 ]

 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
@@ -132,6 +132,10 @@ class Tensor:


 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None

@@ -290,7 +294,7 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
@@ -290,7 +290,7 @@ if __name__ == '__main__':
         fname_out = args.outfile
     else:
         # output in the same directory as the model by default
-        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
+        fname_out = dir_lora

     if os.path.exists(input_model):
         # lazy import load_file only if lora is in safetensors format.

@@ -304,12 +304,6 @@ if __name__ == '__main__':
     # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
-
-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
-    alpha: float = lparams["lora_alpha"]
-
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -320,12 +314,21 @@ if __name__ == '__main__':
         class LoraModel(model_class):
             model_arch = model_class.model_arch

+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()

             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:

@@ -368,6 +371,11 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)

+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
         model_instance = LoraModel(
             dir_base_model,
             ftype,

@@ -376,6 +384,8 @@ if __name__ == '__main__':
             use_temp_file=False,
             eager=args.no_lazy,
             dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
         )

         logger.info("Exporting model...")
@@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {

     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
     printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
     printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
     printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(

     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
     }

     auto new_token_chars = llama_token_to_piece(context, new_token_id);
@@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false

     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0

     var n_decode: Int32 = 0

@@ -160,6 +161,7 @@ actor LlamaContext {

         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str
@@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
         messageLog += "\(text)"

         Task.detached {
-            while await llamaContext.n_cur < llamaContext.n_len {
+            while await !llamaContext.is_done {
                 let result = await llamaContext.completion_loop()
                 await MainActor.run {
                     self.messageLog += "\(result)"
examples/pydantic_models_to_grammar_examples.py (289 changes, Normal file → Executable file)
@@ -1,8 +1,15 @@
-# Function calling example using pydantic models.
+#!/usr/bin/env python3
+
+"""Function calling example using pydantic models."""
+
 from __future__ import annotations

+import argparse
 import datetime
 import json
+import logging
+import textwrap
+import sys
 from enum import Enum
 from typing import Optional, Union

@@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
     create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)


-# Function to get completion on the llama.cpp server with grammar.
-def create_completion(prompt, grammar):
+def create_completion(host, prompt, gbnf_grammar):
+    """Calls the /completion API on llama-server.
+
+    See
+    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    """
+    print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
     headers = {"Content-Type": "application/json"}
-    data = {"prompt": prompt, "grammar": grammar}
-    response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
-    data = response.json()
-
+    data = {"prompt": prompt, "grammar": gbnf_grammar}
+    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
     assert data.get("error") is None, data
-
-    print(data["content"])
-    return data["content"]
+    logging.info("Result: %s", result)
+    content = result["content"]
+    print(f" Model: {result['model']}")
+    print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
+    return content


 # A function for the agent to send a message to the user.
 class SendMessageToUser(BaseModel):
-    """
-    Send a message to the User.
-    """
+    """Send a message to the User."""
     chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
     message: str = Field(..., description="Message you want to send to the user.")

     def run(self):
-        print(self.message)
+        print(f"SendMessageToUser: {self.message}")


+def example_rce(host):
+    """Minimal test case where the LLM call an arbitrary python function."""
+    print("- example_rce")
+    tools = [SendMessageToUser]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "SendMessageToUser":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    tool(**json_data["function_parameters"]).run()
+    return 0
+
+
 # Enum for the calculator tool.
|
||||||
DIVIDE = "divide"
|
DIVIDE = "divide"
|
||||||
|
|
||||||
|
|
||||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
# Simple pydantic calculator tool for the agent that can add, subtract,
|
||||||
|
# multiply, and divide. Docstring and description of fields will be used in
|
||||||
|
# system prompt.
|
||||||
class Calculator(BaseModel):
|
class Calculator(BaseModel):
|
||||||
"""
|
"""Perform a math operation on two numbers."""
|
||||||
Perform a math operation on two numbers.
|
|
||||||
"""
|
|
||||||
number_one: Union[int, float] = Field(..., description="First number.")
|
number_one: Union[int, float] = Field(..., description="First number.")
|
||||||
operation: MathOperation = Field(..., description="Math operation to perform.")
|
operation: MathOperation = Field(..., description="Math operation to perform.")
|
||||||
number_two: Union[int, float] = Field(..., description="Second number.")
|
number_two: Union[int, float] = Field(..., description="Second number.")
|
||||||
|
@ -68,55 +99,61 @@ class Calculator(BaseModel):
|
||||||
raise ValueError("Unknown operation.")
|
raise ValueError("Unknown operation.")
|
||||||
|
|
||||||
|
|
||||||
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
|
def example_calculator(host):
|
||||||
# pydantic_model_list is the list of pydanitc models
|
"""Have the LLM ask to get a calculation done.
|
||||||
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
|
|
||||||
# outer_object_content is the name of outer object content.
|
Here the grammar gets generated by passing the available function models to
|
||||||
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
|
generate_gbnf_grammar_and_documentation function. This also generates a
|
||||||
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
|
documentation usable by the LLM.
|
||||||
|
|
||||||
|
pydantic_model_list is the list of pydantic models outer_object_name is an
|
||||||
|
optional name for an outer object around the actual model object. Like a
|
||||||
|
"function" object with "function_parameters" which contains the actual model
|
||||||
|
object. If None, no outer object will be generated outer_object_content is
|
||||||
|
the name of outer object content.
|
||||||
|
|
||||||
|
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
|
||||||
|
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
|
||||||
|
"""
|
||||||
|
print("- example_calculator")
|
||||||
|
tools = [SendMessageToUser, Calculator]
|
||||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||||
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
|
pydantic_model_list=tools, outer_object_name="function",
|
||||||
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
|
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
|
||||||
|
|
||||||
print(gbnf_grammar)
|
|
||||||
print(documentation)
|
|
||||||
|
|
||||||
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
|
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
|
||||||
|
user_message1 = "What is 42 * 42?"
|
||||||
user_message = "What is 42 * 42?"
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
|
||||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
|
text = create_completion(host, prompt, gbnf_grammar)
|
||||||
|
json_data = json.loads(text)
|
||||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
expected = {
|
||||||
# This should output something like this:
|
"function": "Calculator",
|
||||||
# {
|
"function_parameters": {
|
||||||
# "function": "calculator",
|
"number_one": 42,
|
||||||
# "function_parameters": {
|
"operation": "multiply",
|
||||||
# "number_one": 42,
|
"number_two": 42
|
||||||
# "operation": "multiply",
|
}
|
||||||
# "number_two": 42
|
}
|
||||||
# }
|
if json_data != expected:
|
||||||
# }
|
print(" Result is not as expected!")
|
||||||
function_dictionary = json.loads(text)
|
tools_map = {tool.__name__:tool for tool in tools}
|
||||||
if function_dictionary["function"] == "calculator":
|
# This finds "Calculator":
|
||||||
function_parameters = {**function_dictionary["function_parameters"]}
|
tool = tools_map.get(json_data["function"])
|
||||||
|
if not tool:
|
||||||
print(Calculator(**function_parameters).run())
|
print(f"Error: unknown tool {json_data['function']}")
|
||||||
# This should output: 1764
|
return 1
|
||||||
|
result = tool(**json_data["function_parameters"]).run()
|
||||||
|
print(f" Call {json_data['function']} gave result {result}")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
|
|
||||||
class Category(Enum):
|
class Category(Enum):
|
||||||
"""
|
"""The category of the book."""
|
||||||
The category of the book.
|
|
||||||
"""
|
|
||||||
Fiction = "Fiction"
|
Fiction = "Fiction"
|
||||||
NonFiction = "Non-Fiction"
|
NonFiction = "Non-Fiction"
|
||||||
|
|
||||||
|
|
||||||
class Book(BaseModel):
|
class Book(BaseModel):
|
||||||
"""
|
"""Represents an entry about a book."""
|
||||||
Represents an entry about a book.
|
|
||||||
"""
|
|
||||||
title: str = Field(..., description="Title of the book.")
|
title: str = Field(..., description="Title of the book.")
|
||||||
author: str = Field(..., description="Author of the book.")
|
author: str = Field(..., description="Author of the book.")
|
||||||
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
|
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
|
||||||
|
@ -125,33 +162,42 @@ class Book(BaseModel):
|
||||||
summary: str = Field(..., description="Summary of the book.")
|
summary: str = Field(..., description="Summary of the book.")
|
||||||
|
|
||||||
|
|
||||||
# We need no additional parameters other than our list of pydantic models.
|
def example_struct(host):
|
||||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
|
"""A example structured output based on pydantic models.
|
||||||
|
|
||||||
|
The LLM will create an entry for a Book database out of an unstructured
|
||||||
|
text. We need no additional parameters other than our list of pydantic
|
||||||
|
models.
|
||||||
|
"""
|
||||||
|
print("- example_struct")
|
||||||
|
tools = [Book]
|
||||||
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
|
||||||
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
|
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
|
||||||
|
|
||||||
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
|
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
|
||||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||||
|
text = create_completion(host, prompt, gbnf_grammar)
|
||||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
|
||||||
|
|
||||||
json_data = json.loads(text)
|
json_data = json.loads(text)
|
||||||
|
# In this case, there's no function nor function_parameters.
|
||||||
|
# Here the result will vary based on the LLM used.
|
||||||
|
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
|
||||||
|
if keys != sorted(json_data.keys()):
|
||||||
|
print(f"Unexpected result: {sorted(json_data.keys())}")
|
||||||
|
return 1
|
||||||
|
book = Book(**json_data)
|
||||||
|
print(f" As a Book object: %s" % book)
|
||||||
|
return 0
|
||||||
|
|
||||||
print(Book(**json_data))
|
|
||||||
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
|
|
||||||
|
|
||||||
def get_current_datetime(output_format: Optional[str] = None):
|
def get_current_datetime(output_format: Optional[str] = None):
|
||||||
"""
|
"""Get the current date and time in the given format.
|
||||||
Get the current date and time in the given format.
|
|
||||||
Args:
|
Args:
|
||||||
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
|
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
|
||||||
"""
|
"""
|
||||||
if output_format is None:
|
return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
|
||||||
output_format = '%Y-%m-%d %H:%M:%S'
|
|
||||||
return datetime.datetime.now().strftime(output_format)
|
|
||||||
|
|
||||||
|
|
||||||
# Example function to get the weather
|
# Example function to get the weather.
|
||||||
def get_current_weather(location, unit):
|
def get_current_weather(location, unit):
|
||||||
"""Get the current weather in a given location"""
|
"""Get the current weather in a given location"""
|
||||||
if "London" in location:
|
if "London" in location:
|
||||||
|
@ -160,11 +206,15 @@ def get_current_weather(location, unit):
|
||||||
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
|
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
|
||||||
elif "North Pole" in location:
|
elif "North Pole" in location:
|
||||||
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
|
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
|
||||||
else:
|
|
||||||
return json.dumps({"location": location, "temperature": "unknown"})
|
return json.dumps({"location": location, "temperature": "unknown"})
|
||||||
|
|
||||||
|
|
||||||
# Here is a function definition in OpenAI style
|
def example_concurrent(host):
|
||||||
|
"""An example for parallel function calling with a Python function, a pydantic
|
||||||
|
function model and an OpenAI like function definition.
|
||||||
|
"""
|
||||||
|
print("- example_concurrent")
|
||||||
|
# Function definition in OpenAI style.
|
||||||
current_weather_tool = {
|
current_weather_tool = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
|
@ -183,45 +233,80 @@ current_weather_tool = {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
# Convert OpenAI function definition into pydantic model.
|
||||||
# Convert OpenAI function definition into pydantic model
|
|
||||||
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
|
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
|
||||||
# Add the actual function to a pydantic model
|
# Add the actual function to a pydantic model.
|
||||||
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
|
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
|
||||||
|
|
||||||
# Convert normal Python function to a pydantic model
|
# Convert normal Python function to a pydantic model.
|
||||||
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
|
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
|
||||||
|
|
||||||
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
|
tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
|
||||||
|
|
||||||
|
|
||||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||||
pydantic_model_list=tool_list, outer_object_name="function",
|
pydantic_model_list=tools, outer_object_name="function",
|
||||||
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
|
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
|
||||||
|
|
||||||
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
|
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
|
||||||
|
|
||||||
|
|
||||||
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
|
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
|
||||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||||
|
text = create_completion(host, prompt, gbnf_grammar)
|
||||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
|
||||||
|
|
||||||
json_data = json.loads(text)
|
json_data = json.loads(text)
|
||||||
|
expected = [
|
||||||
print(json_data)
|
{
|
||||||
# Should output something like this:
|
"function": "get_current_datetime",
|
||||||
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
|
"params": {
|
||||||
|
"output_format": "%Y-%m-%d %H:%M:%S"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"function": "get_current_weather",
|
||||||
|
"params": {
|
||||||
|
"location": "London",
|
||||||
|
"unit": "celsius"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"function": "Calculator",
|
||||||
|
"params": {
|
||||||
|
"number_one": 42,
|
||||||
|
"operation": "multiply",
|
||||||
|
"number_two": 42
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
res = 0
|
||||||
|
if json_data != expected:
|
||||||
|
print(" Result is not as expected!")
|
||||||
|
print(" This can happen on highly quantized models")
|
||||||
|
res = 1
|
||||||
|
tools_map = {tool.__name__:tool for tool in tools}
|
||||||
for call in json_data:
|
for call in json_data:
|
||||||
if call["function"] == "Calculator":
|
tool = tools_map.get(call["function"])
|
||||||
print(Calculator(**call["params"]).run())
|
if not tool:
|
||||||
elif call["function"] == "get_current_datetime":
|
print(f"Error: unknown tool {call['function']}")
|
||||||
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
|
return 1
|
||||||
elif call["function"] == "get_current_weather":
|
result = tool(**call["params"]).run()
|
||||||
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
|
print(f" Call {call['function']} returned {result}")
|
||||||
# Should output something like this:
|
# Should output something like this:
|
||||||
# 2024-01-14 13:36:06
|
# Call get_current_datetime returned 2024-07-15 09:50:38
|
||||||
# {"location": "London", "temperature": "42", "unit": "celsius"}
|
# Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
|
||||||
# 1764
|
# Call Calculator returned 1764
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
|
||||||
|
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
|
||||||
|
parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
|
||||||
|
args = parser.parse_args()
|
||||||
|
logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
|
||||||
|
ret = 0
|
||||||
|
# Comment out below to only run the example you want.
|
||||||
|
ret = ret or example_rce(args.host)
|
||||||
|
ret = ret or example_calculator(args.host)
|
||||||
|
ret = ret or example_struct(args.host)
|
||||||
|
ret = ret or example_concurrent(args.host)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
|
|
|
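For reference, the reworked example script above is driven through main() with a --host argument (default localhost:8080) and a -v flag for logging, e.g. python examples/pydantic_models_to_grammar_examples.py --host localhost:8080 -v. A minimal sketch of calling the individual examples programmatically, assuming a llama-server instance is already running locally and that the file is importable as a module (the import name below is hypothetical shorthand for the file in this commit):

# Sketch only: drive the reworked examples against a local llama-server.
import pydantic_models_to_grammar_examples as examples  # hypothetical import of the file above

host = "localhost:8080"                 # same default the new --host flag uses
ret = examples.example_rce(host)        # each example returns 0 on success, 1 otherwise
ret = ret or examples.example_calculator(host)
print("all examples passed" if ret == 0 else "at least one example failed")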
@@ -444,7 +444,7 @@ node index.js

 `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.

-`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
 By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

 `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
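The parameters documented in this hunk are fields of the JSON body accepted by the server's completion endpoint. A minimal request sketch, not part of the commit; it assumes a server already listening on localhost:8080 and uses arbitrary prompt text and values:

# Hypothetical request illustrating n_predict, n_keep and stream; values are arbitrary.
import json
import urllib.request

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 128,  # generate at most 128 tokens
    "n_keep": -1,      # keep the whole prompt when the context overflows
    "stream": False,   # set to True to receive tokens as they are generated
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])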
6 flake.lock generated

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1720768451,
-        "narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
+        "lastModified": 1721379653,
+        "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
+        "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
         "type": "github"
       },
       "original": {
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;

@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;

File diff suppressed because it is too large
@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]

 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }

+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4
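The arithmetic of the new vec_dot_q8_0_16_q8_1_impl device function is an integer dot product per sub-block, scaled by one factor per block of the first operand and a single factor for the second. A rough NumPy sketch of the same computation, not part of the commit and with made-up block size and data:

import numpy as np

def vec_dot_q8_0_16_q8_1_ref(v, u, d8_0, d8_1, block):
    # v, u: int8 quantized values; d8_0: one scale per block of v; d8_1: scale of u
    sumf = 0.0
    for b in range(len(v) // block):
        vi = v[b * block:(b + 1) * block].astype(np.int32)
        ui = u[b * block:(b + 1) * block].astype(np.int32)
        sumf += d8_0[b] * int(np.dot(vi, ui))  # integer dot product, then per-block scale
    return d8_1 * sumf

rng = np.random.default_rng(0)
v = rng.integers(-128, 128, size=64, dtype=np.int8)
u = rng.integers(-128, 128, size=64, dtype=np.int8)
print(vec_dot_q8_0_16_q8_1_ref(v, u, d8_0=np.full(4, 0.01), d8_1=0.02, block=16))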
@@ -1786,10 +1786,6 @@ static enum ggml_status ggml_metal_graph_compute(
                     }
                 };

-                if (ggml_is_quantized(src0t)) {
-                    GGML_ASSERT(ne00 >= nth0*nth1);
-                }
-
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -4757,7 +4757,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         device const float4 * y4 = (device const float4 *)yb;
         yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];

-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < ne01; ++row) {

             device const block_iq4_nl & xb = x[row*nb + ib];
             device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);

@@ -4789,7 +4789,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
         yb += 16 * QK4_NL;
     }

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;

File diff suppressed because it is too large
@@ -19019,7 +19019,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     FILE * fout = ggml_fopen(fname, "wb");

     if (!fout) {
-        fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+        fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
         return;
     }

@@ -19156,7 +19156,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
     {
         FILE * fin = ggml_fopen(fname, "rb");
         if (!fin) {
-            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            fprintf(stderr, "%s: failed to open %s: %s\n", __func__, fname, strerror(errno));
             return result;
         }

@@ -20830,6 +20830,7 @@ struct gguf_context * gguf_init_empty(void) {
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     FILE * file = ggml_fopen(fname, "rb");
     if (!file) {
+        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
         return NULL;
     }

@@ -21014,7 +21015,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             gguf_tensor_info_sanitize(info);

             // make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i; ++j) {
+            for (uint64_t j = 0; j < i && ok; ++j) {
                 if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
                     fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
                     ok = false;
@@ -54,6 +54,7 @@ class Metadata:

         model_card = Metadata.load_model_card(model_path)
         hf_params = Metadata.load_hf_parameters(model_path)
+        # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter

         # heuristics
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)

@@ -62,6 +63,7 @@ class Metadata:
         # This is based on LLM_KV_NAMES mapping in llama.cpp
         metadata_override = Metadata.load_metadata_override(metadata_override_path)

+        metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
         metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
         metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
         metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)

@@ -176,6 +178,12 @@ class Metadata:
             org_component = None

         name_parts: list[str] = model_full_name_component.split('-')
+
+        # Remove empty parts
+        for i in reversed(range(len(name_parts))):
+            if len(name_parts[i]) == 0:
+                del name_parts[i]
+
         name_types: list[
             set[Literal["basename", "size_label", "finetune", "version", "type"]]
         ] = [set() for _ in name_parts]

@@ -222,9 +230,19 @@ class Metadata:
                 name_parts[i] = part
             # Some easy to recognize finetune names
             elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
-                name_types[i].add("finetune")
-                if part.lower() == "lora":
-                    name_parts[i] = "LoRA"
+                if total_params < 0 and part.lower() == "lora":
+                    # ignore redundant "lora" in the finetune part when the output is a lora adapter
+                    name_types[i].add("type")
+                else:
+                    name_types[i].add("finetune")
+
+        # Ignore word-based size labels when there is at least a number-based one present
+        # TODO: should word-based size labels always be removed instead?
+        if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
+            for n, t in zip(name_parts, name_types):
+                if "size_label" in t:
+                    if all(c.isalpha() for c in n):
+                        t.remove("size_label")

         at_start = True
         # Find the basename through the annotated name

@@ -239,18 +257,18 @@ class Metadata:

         # Remove the basename annotation from trailing version
         for part, t in zip(reversed(name_parts), reversed(name_types)):
-            if "basename" in t:
-                if len(t) > 1:
-                    t.remove("basename")
+            if "basename" in t and len(t) > 1:
+                t.remove("basename")
             else:
                 break

         basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
-        size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None
+        # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
+        size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
         finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
         # TODO: should the basename version always be excluded?
-        # TODO: should multiple versions be joined together?
-        version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1]
+        # NOTE: multiple finetune versions are joined together
+        version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None

         if size_label is None and finetune is None and version is None:
             # Too ambiguous, output nothing
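The size-label change above switches to dict.fromkeys() because a dict deduplicates while preserving insertion order, whereas a set gives no ordering guarantee. A quick illustration with made-up labels, not part of the commit:

labels = ["7B", "mini", "7B"]
print("-".join(set(labels)))            # order not guaranteed, may print 'mini-7B'
print("-".join(dict.fromkeys(labels)))  # '7B-mini', first-seen order preserved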
@@ -50,15 +50,15 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
     # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention

     if base_name is not None:
-        name = base_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = base_name.strip().replace(' ', '-').replace('/', '-')
     elif model_name is not None:
-        name = model_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = model_name.strip().replace(' ', '-').replace('/', '-')
     else:
         name = "ggml-model"

     parameters = f"-{size_label}" if size_label is not None else ""

-    finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else ""
+    finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""

     version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
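Dropping .title() here keeps the original capitalization of model names instead of re-capitalizing each word, which mangles mixed-case names. A small before/after illustration, not part of the commit and using arbitrary names:

# Shows what the removed .title() call did to mixed-case names.
for base_name in ("SmolLM", "DeepSeek-Coder-V2"):
    old = base_name.strip().title().replace(' ', '-').replace('/', '-')
    new = base_name.strip().replace(' ', '-').replace('/', '-')
    print(f"{base_name!r}: old={old!r} new={new!r}")
# 'SmolLM': old='Smollm' new='SmolLM'
# 'DeepSeek-Coder-V2': old='Deepseek-Coder-V2' new='DeepSeek-Coder-V2'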
@@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 import argparse
 import os
+import re
 import sys
 from pathlib import Path
 from typing import Any
@@ -244,26 +245,58 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
         else:
             pretty_type = str(field.types[-1].name)

+        def escape_markdown_inline_code(value_string):
+            # Find the longest contiguous sequence of backticks in the string then
+            # wrap string with appropriate number of backticks required to escape it
+            max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
+            inline_code_marker = '`' * (max_backticks + 1)
+
+            # If the string starts or ends with a backtick, add a space at the beginning and end
+            if value_string.startswith('`') or value_string.endswith('`'):
+                value_string = f" {value_string} "
+
+            return f"{inline_code_marker}{value_string}{inline_code_marker}"
+
         total_elements = len(field.data)
         value = ""
         if len(field.types) == 1:
             curr_type = field.types[0]
             if curr_type == GGUFValueType.STRING:
-                value = repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])
+                truncate_length = 60
+                value_string = str(bytes(field.parts[-1]), encoding='utf-8')
+                if len(value_string) > truncate_length:
+                    head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                    tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                    value = "{head}...{tail}".format(head=head, tail=tail)
+                else:
+                    value = escape_markdown_inline_code(value_string)
             elif curr_type in reader.gguf_scalar_to_np:
                 value = str(field.parts[-1][0])
         else:
             if field.types[0] == GGUFValueType.ARRAY:
                 curr_type = field.types[1]
+                array_elements = []
+
                 if curr_type == GGUFValueType.STRING:
                     render_element = min(5, total_elements)
                     for element_pos in range(render_element):
-                        value += repr(str(bytes(field.parts[-1 - element_pos]), encoding='utf-8')[:5]) + (", " if total_elements > 1 else "")
+                        truncate_length = 30
+                        value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
+                        if len(value_string) > truncate_length:
+                            head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                            tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                            value = "{head}...{tail}".format(head=head, tail=tail)
+                        else:
+                            value = escape_markdown_inline_code(value_string)
+                        array_elements.append(value)
+
                 elif curr_type in reader.gguf_scalar_to_np:
                     render_element = min(7, total_elements)
                     for element_pos in range(render_element):
-                        value += str(field.parts[-1 - element_pos][0]) + (", " if total_elements > 1 else "")
-                value = f'[ {value}{" ..." if total_elements > 1 else ""} ]'
+                        array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
+
+                value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'

         kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})

     kv_dump_table_header_map = [
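For reference, the backtick-escaping helper added above wraps a value in one more backtick than the longest backtick run it contains, and pads with spaces when the value itself starts or ends with a backtick. A standalone copy to show the behaviour (the sample strings are made up, not from the commit):

import re

def escape_markdown_inline_code(value_string):
    # Same logic as the helper added to the dump script above.
    max_backticks = max((len(m.group(0)) for m in re.finditer(r'`+', value_string)), default=0)
    inline_code_marker = '`' * (max_backticks + 1)
    if value_string.startswith('`') or value_string.endswith('`'):
        value_string = f" {value_string} "
    return f"{inline_code_marker}{value_string}{inline_code_marker}"

print(escape_markdown_inline_code("plain text"))             # `plain text`
print(escape_markdown_inline_code("uses ``double`` ticks"))  # ```uses ``double`` ticks```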
@ -54,7 +54,7 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
|
||||||
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
|
('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
|
||||||
|
|
||||||
# Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing...
|
# Non standard naming
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
|
||||||
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
|
('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
|
||||||
|
|
||||||
|
@ -71,7 +71,7 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
|
self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
|
||||||
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
|
('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
|
||||||
|
|
||||||
# None standard and not easy to disambiguate
|
# Non standard and not easy to disambiguate
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
|
||||||
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
|
('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
|
||||||
|
|
||||||
|
@ -123,6 +123,51 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
|
self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
|
||||||
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
|
('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
|
||||||
|
|
||||||
|
# Ignore full-text size labels when there are number-based ones, and deduplicate size labels
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
|
||||||
|
('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
|
||||||
|
|
||||||
|
# Instruct in a name without a size label
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
|
||||||
|
('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
|
||||||
|
|
||||||
|
# Non-obvious splitting relying on 'chat' keyword
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
|
||||||
|
('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
|
||||||
|
|
||||||
|
# Multiple versions
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
|
||||||
|
('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
|
||||||
|
|
||||||
|
# TODO: DPO in the name
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
|
||||||
|
('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
|
||||||
|
|
||||||
|
# DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
|
||||||
|
('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
|
||||||
|
|
||||||
|
# Too ambiguous
|
||||||
|
# TODO: should "base" be a 'finetune' or 'size_label'?
|
||||||
|
# (in this case it should be a size label, but other models use it to signal that they are not finetuned)
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
|
||||||
|
('Florence-2-base', 'microsoft', None, None, None, None))
|
||||||
|
|
||||||
|
## Invalid cases ##
|
||||||
|
|
||||||
|
# Start with a dash and has dashes in rows
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
|
||||||
|
('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
|
||||||
|
|
||||||
|
## LoRA ##
|
||||||
|
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
|
||||||
|
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
|
||||||
|
|
||||||
|
# Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix
|
||||||
|
self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
|
||||||
|
('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
|
||||||
|
|
||||||
def test_apply_metadata_heuristic_from_model_card(self):
|
def test_apply_metadata_heuristic_from_model_card(self):
|
||||||
model_card = {
|
model_card = {
|
||||||
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
|
'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
|
||||||
|
@ -134,7 +179,7 @@ class TestMetadataMethod(unittest.TestCase):
|
||||||
}
|
}
|
||||||
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
|
got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
|
||||||
expect = gguf.Metadata()
|
expect = gguf.Metadata()
|
||||||
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
|
expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
|
||||||
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
|
expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
|
||||||
expect.languages=['en']
|
expect.languages=['en']
|
||||||
expect.datasets=['teknium/OpenHermes-2.5']
|
expect.datasets=['teknium/OpenHermes-2.5']
|
||||||
|
|
|
@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 6
+#define LLAMA_SESSION_VERSION 7

 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1

@@ -92,6 +92,9 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHATGLM4   = 17,
         LLAMA_VOCAB_PRE_TYPE_VIKING     = 18,
         LLAMA_VOCAB_PRE_TYPE_JAIS       = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN     = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM     = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL  = 22,
     };

     // note: these values should be synchronized with ggml_rope
Binary file not shown.
Binary file not shown.
@@ -114,7 +114,7 @@

 // bump if necessary
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_LAYERS  256
+#define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

 //

@@ -3875,7 +3875,7 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

             {
-                const int kid = gguf_find_key(meta, "general.file_type");
+                const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
                 if (kid >= 0) {
                     ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
                 }

@@ -4007,7 +4007,9 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
             }

-            GGML_ASSERT(arr_info.length <= N_MAX);
+            if (arr_info.length > N_MAX) {
+                throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
+            }

             std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());

@@ -4043,8 +4045,6 @@ struct llama_model_loader {
     // get array of n <= N_MAX elements, or a single element repeated n times
     template<typename T, size_t N_MAX>
     bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
-        GGML_ASSERT(n <= N_MAX);
-
         const int kid = gguf_find_key(meta, key.c_str());

         if (kid < 0) {

@@ -4054,6 +4054,10 @@ struct llama_model_loader {
             return false;
         }

+        if (n > N_MAX) {
+            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+        }
+
         if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
             struct GGUFMeta::ArrayInfo arr_info =
                 GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);

@@ -5003,7 +5007,7 @@ static void llm_load_hparams(
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
             switch (hparams.n_layer) {
-                case 42: model.type = e_model::MODEL_SMALL; break;
+                case 42: model.type = e_model::MODEL_7B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -5365,6 +5369,7 @@ static void llm_load_vocab(
         if (merges_keyidx == -1) {
             throw std::runtime_error("cannot find tokenizer merges in model file\n");
         }

         const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);

@@ -5403,16 +5408,6 @@ static void llm_load_vocab(
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;

-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.
-
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);

@@ -5520,6 +5515,19 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }

@@ -5543,10 +5551,8 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6127,10 +6133,10 @@ static bool llm_load_tensors(

                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                         // optional bias tensors
                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

@@ -15544,6 +15550,8 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

@@ -15581,6 +15589,13 @@ struct llm_tokenizer_bpe {
                     "\\p{N}",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {

@@ -18271,8 +18286,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV

     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());

@@ -19435,7 +19451,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:

@@ -19465,6 +19480,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here

@@ -19920,7 +19936,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     );

     // on session change it is very likely that the state size has changed - so we need to update this function
-    static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+    static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");

     return s_total;
 }

@@ -21607,7 +21623,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == "chaglm4" || tmpl_contains("[gMASK]<sop>")) {
+    } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
 target_link_libraries(test-tokenizer-0 PRIVATE common)
 install(TARGETS test-tokenizer-0 RUNTIME)

-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
-# TODO: enable when fixed
-# https://github.com/ggerganov/llama.cpp/pull/7036
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)

@@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)

 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)

 # build test-tokenizer-1-spm target once and add many tests
 add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
@@ -79,8 +79,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                 im = nullptr;
             }
         }

         ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
         GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
+        // TODO: other cases
+        //#pragma omp parallel for
+        //for (int i = 0; i < tensor->ne[1]; i++) {
+        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
+        //        i * tensor->ne[0], 1, tensor->ne[0], im);
+        //}
+
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.

@@ -2220,6 +2228,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
     }

+#if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));

@@ -2239,6 +2248,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
         }
     }
+#else
+    // m = a rows
+    // n = b rows
+    // k = cols
+    std::uniform_int_distribution<> dist_m(1, 128);
+    std::uniform_int_distribution<> dist_n(16, 128);
+    std::uniform_int_distribution<> dist_k(1, 16);
+    for (int i = 0; i < 1000; i++) {
+        for (ggml_type type_a : all_types) {
+            for (ggml_type type_b : {GGML_TYPE_F32}) {
+                int m = dist_m(rng);
+                int n = dist_n(rng);
+                int k = dist_k(rng) * ggml_blck_size(type_a);
+                test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
+            }
+        }
+    }
+#endif

     for (ggml_type type_a : other_types) {
         for (ggml_type type_b : {GGML_TYPE_F32}) {