Merge branch 'master' into xsn/clean_up_server

2024-01-22 21:41:01 +01:00 · 2024-01-22 21:41:01 +01:00 · 58fe9cf572
commit 58fe9cf572
parent 906afe7810 6f9939d119
34 changed files with 2271 additions and 701 deletions
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@ -7,6 +7,18 @@
    { system, ... }:
    {
      _module.args = {
        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
        # again, the below creates several nixpkgs instances which the
        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
        #
        # This is currently "slow" and "expensive", on a certain scale.
        # This also isn't "right" in that this hinders dependency injection at
        # the level of flake inputs. This might get removed in the foreseeable
        # future.
        #
        # Note that you can use these expressions without flakes
        # (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -73,6 +73,7 @@ let
    ps: [
      ps.numpy
      ps.sentencepiece
      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers
    ]
@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;
    # Note: none of the files discarded here are visible in the sandbox or
    # affect the output hash. This also means they can be modified without
    # triggering a rebuild.
    src = lib.cleanSourceWith {
      filter =
        name: type:
-        !(builtins.any (_: _) [
+        let
          noneOf = builtins.all (x: !x);
          baseName = baseNameOf name;
        in
        noneOf [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes whe computing outPaths
+          (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
+          (lib.hasPrefix "." baseName) # Skip hidden files and directories
-        ]);
+          (baseName == "flake.lock")
        ];
      src = lib.cleanSource ../../.;
    };
@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
    cmakeFlags =
      [
-        (cmakeBool "LLAMA_NATIVE" true)
+        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" true)
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@ -4,6 +4,10 @@
  llamaVersion ? "0.0.0",
 }:
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
 lib.makeScope newScope (
  self: {
    inherit llamaVersion;
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -295,7 +295,7 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.21.1-2023-04-24
+      SDE_VERSION: 9.33.0-2024-01-07
    strategy:
      matrix:
@ -400,7 +400,7 @@ jobs:
        id: cmake_test_sde
        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@ -2,13 +2,20 @@ name: Nix aarch64 builds
 on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']
 jobs:
  nix-build-aarch64:
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@ -5,10 +5,8 @@ on:
  push:
    branches:
      - master
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
 jobs:
  nix-eval:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -47,6 +47,7 @@ option(BUILD_SHARED_LIBS                "build shared libraries"
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
 option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)
 # debug
 option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
@ -107,6 +108,13 @@ option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STA
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
 # add perf arguments
 option(LLAMA_PERF                            "llama: enable perf"                               OFF)
 if (LLAMA_PERF)
    add_definitions(-DGGML_PERF)
 endif()
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@ -561,6 +569,17 @@ if (LLAMA_LTO)
    endif()
 endif()
 if (LLAMA_CCACHE)
    find_program(LLAMA_CCACHE_FOUND ccache)
    if (LLAMA_CCACHE_FOUND)
        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "Using ccache")
    else()
        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
    endif ()
 endif()
 # this version of Apple ld64 is buggy
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
--- a/README.md
+++ b/README.md
@ -128,6 +128,7 @@ as the main playground for developing new features for the [ggml](https://github
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 **UI:**
--- a/common/common.cpp
+++ b/common/common.cpp
@ -203,6 +203,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.prompt_cache_all = true;
        } else if (arg == "--prompt-cache-ro") {
            params.prompt_cache_ro = true;
        } else if (arg == "-bf" || arg == "--binary-file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::ifstream file(argv[i], std::ios::binary);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                invalid_param = true;
                break;
            }
            // store the external file name in params
            params.prompt_file = argv[i];
            file.seekg(0, std::ios::end);
            size_t size = file.tellg();
            file.seekg(0, std::ios::beg);
            params.prompt.resize(size);
            file.read((char *)params.prompt.data(), size);
            fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@ -653,6 +672,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.logits_file = argv[i];
        } else if (arg == "--perplexity" || arg == "--all-logits") {
            params.logits_all = true;
        } else if (arg == "--ppl-stride") {
@ -689,6 +714,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
        } else if (arg == "--multiple-choice") {
            params.multiple_choice = true;
        } else if (arg == "--multiple-choice-tasks") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.multiple_choice_tasks = std::stoi(argv[i]);
        } else if (arg == "--kl-divergence") {
            params.kl_divergence = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
@ -888,6 +923,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    printf("  -f FNAME, --file FNAME\n");
    printf("                        prompt file to start generation.\n");
    printf("  -bf FNAME, --binary-file FNAME\n");
    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@ -936,6 +973,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base");
    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
--- a/common/common.h
+++ b/common/common.h
@ -91,6 +91,7 @@ struct gpt_params {
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files
    std::string logits_file       = "";  // file for saving *all* logits
    std::vector<llama_model_kv_override> kv_overrides;
@ -108,6 +109,11 @@ struct gpt_params {
    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
    bool   kl_divergence   = false; // compute KL-divergence
    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
 import numpy as np
 import torch
@ -289,6 +289,58 @@ class Model:
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)
    def _set_vocab_qwen(self):
        """Write a Qwen-style BPE vocabulary to ``self.gguf_writer``.

        Loads the tokenizer found in ``self.dir_model`` through
        ``AutoTokenizer``, rebuilds the BPE merge list from
        ``tokenizer.mergeable_ranks``, and emits exactly
        ``self.hparams["vocab_size"]`` tokens (ids the tokenizer does not
        define are filled with ``[PAD{i}]`` placeholders). The token list,
        token types, merges and special-token ids are then handed to the
        GGUF writer.

        Raises:
            AssertionError: if the tokenizer defines an id that is not below
                ``vocab_size``, or if a multi-byte token does not split into
                exactly two parts.
        """
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []
        # imported lazily, so transformers is only needed when this loader runs
        from transformers import AutoTokenizer
        # NOTE: trust_remote_code=True allows tokenizer code shipped with the
        # model directory to be executed
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        # every id known to the tokenizer must fit inside the declared vocab size
        assert max(tokenizer.get_vocab().values()) < vocab_size
        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token)] = rank
            # single-byte tokens contribute no merge entry
            if len(token) == 1:
                continue
            # re-run BPE on the token with max_rank set to its own rank; the
            # assert below requires exactly two parts, which are joined into
            # one merge entry (presumably the pair whose merge yields this
            # token - confirm against QwenModel.bpe)
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
        added_vocab = tokenizer.special_tokens
        # id -> token string; the dict union operator `|` needs Python 3.9+
        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
        for i in range(vocab_size):
            if i not in reverse_vocab:
                # id not defined by the tokenizer: emit a placeholder
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                # token comes from tokenizer.special_tokens
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        # merges were computed above, so SpecialVocab is told not to load them itself
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        # only add special tokens when they were not already loaded from config.json
        if len(special_vocab.special_token_ids) == 0:
            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        # this one is usually not in config.json anyway
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)
    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor
@ -487,7 +539,8 @@ class MPTModel(Model):
            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name.replace("scales", "act.scales")
+                if new_name is not None:
                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
@ -876,6 +929,13 @@ class PersimmonModel(Model):
 class StableLMModel(Model):
    def set_vocab(self):
        """Choose the vocab loader based on the files in the model directory.

        Uses the GPT-2 style loader when ``tokenizer.json`` exists in
        ``self.dir_model``; otherwise falls back to the Qwen-style loader.
        """
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
        else:
            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
            self._set_vocab_qwen()
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@ -904,7 +964,7 @@ class QwenModel(Model):
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
@ -921,52 +981,7 @@ class QwenModel(Model):
        return parts
    def set_vocab(self):
-        dir_model = self.dir_model
+        self._set_vocab_qwen()
        hparams = self.hparams
        tokens: list[bytearray] = []
        toktypes: list[int] = []
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size
        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
        for token, rank in mergeable_ranks.items():
            vocab[self.token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
            assert len(merged) == 2
            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
        added_vocab = tokenizer.special_tokens
        for i in range(vocab_size):
            if i not in reverse_vocab:
                pad_token = f"[PAD{i}]".encode("utf-8")
                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
        special_vocab.merges = merges
        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
@ -1285,7 +1300,7 @@ def main() -> None:
    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@ -2,6 +2,7 @@
 from __future__ import annotations
 import argparse
 import os
 import struct
 import sys
 from enum import IntEnum
@ -9,7 +10,6 @@ from pathlib import Path
 import numpy as np
 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(
+    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
-        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+    vocab_factory = convert.VocabFactory(vocab_path)
-        cfg.vocabtype)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
    # FIXME: Respect cfg.vocab_dir?
    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
                               load_merges = cfg.vocabtype == 'bpe',
                               n_vocab = vocab.vocab_size)
    convert.check_vocab_size(params, vocab)
-    return (params, vocab, svocab)
+    return params, vocab, special_vocab
 def handle_args():
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@ -5,17 +5,16 @@ import json
 import os
 import struct
 import sys
 from pathlib import Path
 from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
 from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -60,7 +59,14 @@ if __name__ == '__main__':
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
-    model = torch.load(input_model, map_location="cpu")
+    if os.path.exists(input_model):
        model = torch.load(input_model, map_location="cpu")
    else:
        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file
        model = load_file(input_model, device="cpu")
    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@ -1,11 +1,13 @@
 #!/usr/bin/env python3
 import torch
 import os
 from pprint import pprint
 import sys
 import argparse
 import os
 import sys
 from pathlib import Path
 from pprint import pprint
 import torch
 from sentencepiece import SentencePieceProcessor
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -69,7 +71,7 @@ def main():
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
-    tensors = {}
+    tensors: dict[str, torch.Tensor] = {}
    _flatten_dict(persimmon_model['model'], tensors, None)
    arch = gguf.MODEL_ARCH.PERSIMMON
--- a/convert.py
+++ b/convert.py
@ -17,58 +17,28 @@ import signal
 import struct
 import sys
 import time
 import warnings
 import zipfile
 from abc import ABCMeta, abstractmethod
 from argparse import ArgumentParser
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
    IO,
    TYPE_CHECKING,
    Any,
    Callable,
    Iterable,
    Literal,
    Optional,
    Tuple,
    TypeVar,
 )
 import numpy as np
 from sentencepiece import SentencePieceProcessor
-try:
+if 'NO_LOCAL_GGUF' not in os.environ:
-    from transformers import AutoTokenizer
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-except ModuleNotFoundError as e:
+import gguf
    warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
-# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
+if TYPE_CHECKING:
-if "NO_LOCAL_GGUF" not in os.environ:
+    from typing import TypeAlias
    # Use absolute path to the gguf-py directory
    gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
    print(gguf_py_dir)  # NOTE: Remove this once path is verified after changes are completed
    if gguf_py_dir not in sys.path:
        sys.path.insert(1, gguf_py_dir)
-# Import gguf module
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
 try:
    import gguf
 except ModuleNotFoundError as e:
    print(f"Could not import gguf: {e}")
    sys.exit(1)
 if TYPE_CHECKING:  # NOTE: This isn't necessary.
    from typing import TypeAlias  # This can technically be omitted.
 if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
    faulthandler.register(signal.SIGUSR1)
-# NOTE: n-dimensional arrays should be directly referenced
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 NDArray: TypeAlias = "np.ndarray[Any, Any]"
 # Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
 ARCH = gguf.MODEL_ARCH.LLAMA
 DEFAULT_CONCURRENCY = 8
@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
 #
 # TODO: Clean up and refactor data types
@dataclass(frozen=True)
 class DataType:
    name: str
@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
@dataclass
 class Params:
-    n_vocab: int
+    n_vocab:        int
-    n_embd: int
+    n_embd:         int
-    n_layer: int
+    n_layer:        int
-    n_ctx: int
+    n_ctx:          int
-    n_ff: int
+    n_ff:           int
-    n_head: int
+    n_head:         int
-    n_head_kv: int
+    n_head_kv:      int
-    f_norm_eps: Optional[float] = None
+    n_experts:      int | None = None
-    n_experts: Optional[int] = None
+    n_experts_used: int | None = None
-    n_experts_used: Optional[int] = None
+    f_norm_eps:     float | None = None
-    rope_scaling_type: Optional[gguf.RopeScalingType] = None
+    rope_scaling_type: gguf.RopeScalingType | None = None
-    f_rope_freq_base: Optional[float] = None
+    f_rope_freq_base: float | None = None
-    f_rope_scale: Optional[float] = None
+    f_rope_scale: float | None = None
-    n_orig_ctx: Optional[int] = None
+    n_orig_ctx: int | None = None
-    rope_finetuned: Optional[bool] = None
+    rope_finetuned: bool | None = None
-    ftype: Optional[GGMLFileType] = None
+    ftype: GGMLFileType | None = None
    # path to the directory containing the model files
-    path_model: Optional[Path] = None
+    path_model: Path | None = None
    @staticmethod
-    def guessed(model: LazyModel) -> "Params":
+    def guessed(model: LazyModel) -> Params:
        # try transformer naming first
-        n_vocab, n_embd = (
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
            model["model.embed_tokens.weight"].shape
            if "model.embed_tokens.weight" in model
            else model["tok_embeddings.weight"].shape
        )
        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-                i
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-                for i in itertools.count()
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
            )
        elif (
            "model.layers.0.self_attn.W_pack.weight" in model
        ):  # next: try baichuan naming
            n_layer = next(
                i
                for i in itertools.count()
                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
            )
        else:
-            n_layer = next(
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
                i
                for i in itertools.count()
                if f"layers.{i}.attention.wq.weight" not in model
            )
        if n_layer < 1:
-            raise Exception(
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
            )
-        n_head = n_embd // 128  # guessed
+        n_head = n_embd // 128 # guessed
-        n_mult = 256  # guessed
+        n_mult = 256           # guessed
        # TODO: verify this
        n_ff = int(2 * (4 * n_embd) / 3)
        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
        return Params(
-            n_vocab=n_vocab,
+            n_vocab    = n_vocab,
-            n_embd=n_embd,
+            n_embd     = n_embd,
-            n_layer=n_layer,
+            n_layer    = n_layer,
-            n_ctx=-1,
+            n_ctx      = -1,
-            n_ff=n_ff,
+            n_ff       = n_ff,
-            n_head=n_head,
+            n_head     = n_head,
-            n_head_kv=n_head,
+            n_head_kv  = n_head,
-            f_norm_eps=1e-5,
+            f_norm_eps = 1e-5,
        )
    @staticmethod
-    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
+    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))
        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@ -274,22 +223,20 @@ class Params:
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
-                rope_finetuned = rope_scaling["finetuned"]
+                rope_finetuned = rope_scaling['finetuned']
            else:
-                raise NotImplementedError(f"Unknown rope scaling type: {typ}")
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
        if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            raise Exception(
+            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
            )
-        n_experts = None
+        n_experts      = None
        n_experts_used = None
        if "num_local_experts" in config:
@ -297,30 +244,30 @@ class Params:
            n_experts_used = config["num_experts_per_tok"]
        return Params(
-            n_vocab=config["vocab_size"],
+            n_vocab           = config["vocab_size"],
-            n_embd=config["hidden_size"],
+            n_embd            = config["hidden_size"],
-            n_layer=config["num_hidden_layers"],
+            n_layer           = config["num_hidden_layers"],
-            n_ctx=n_ctx,
+            n_ctx             = n_ctx,
-            n_ff=config["intermediate_size"],
+            n_ff              = config["intermediate_size"],
-            n_head=(n_head := config["num_attention_heads"]),
+            n_head            = (n_head := config["num_attention_heads"]),
-            n_head_kv=config.get("num_key_value_heads", n_head),
+            n_head_kv         = config.get("num_key_value_heads", n_head),
-            n_experts=n_experts,
+            n_experts         = n_experts,
-            n_experts_used=n_experts_used,
+            n_experts_used    = n_experts_used,
-            f_norm_eps=config["rms_norm_eps"],
+            f_norm_eps        = config["rms_norm_eps"],
-            f_rope_freq_base=config.get("rope_theta"),
+            f_rope_freq_base  = config.get("rope_theta"),
-            rope_scaling_type=rope_scaling_type,
+            rope_scaling_type = rope_scaling_type,
-            f_rope_scale=f_rope_scale,
+            f_rope_scale      = f_rope_scale,
-            n_orig_ctx=n_orig_ctx,
+            n_orig_ctx        = n_orig_ctx,
-            rope_finetuned=rope_finetuned,
+            rope_finetuned    = rope_finetuned,
        )
    # LLaMA v2 70B params.json
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
-    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
+    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))
-        n_experts = None
+        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None
@ -343,50 +290,50 @@ class Params:
        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-            n_experts = config["moe"]["num_experts"]
+            n_experts      = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6
        return Params(
-            n_vocab=model["tok_embeddings.weight"].shape[0],
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
-            n_embd=config["dim"],
+            n_embd           = config["dim"],
-            n_layer=config["n_layers"],
+            n_layer          = config["n_layers"],
-            n_ctx=n_ctx,
+            n_ctx            = n_ctx,
-            n_ff=n_ff,
+            n_ff             = n_ff,
-            n_head=(n_head := config["n_heads"]),
+            n_head           = (n_head := config["n_heads"]),
-            n_head_kv=config.get("n_kv_heads", n_head),
+            n_head_kv        = config.get("n_kv_heads", n_head),
-            n_experts=n_experts,
+            n_experts        = n_experts,
-            n_experts_used=n_experts_used,
+            n_experts_used   = n_experts_used,
-            f_norm_eps=config["norm_eps"],
+            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )
    @staticmethod
-    def load(model_plus: ModelPlus) -> "Params":
+    def load(model_plus: ModelPlus) -> Params:
-        hf_config_path = model_plus.paths[0].parent / "config.json"
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
        orig_config_path = model_plus.paths[0].parent / "params.json"
        if hf_config_path.exists():
-            params = Params.load_transformers_config(model_plus.model, hf_config_path)
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
-            params = Params.load_torch_params(model_plus.model, orig_config_path)
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        elif model_plus.format != "none":
+        elif model_plus.format != 'none':
            params = Params.guessed(model_plus.model)
        else:
-            raise ValueError("Cannot guess params when model format is none")
+            raise ValueError('Cannot guess params when model format is none')
        params.path_model = model_plus.paths[0].parent
        return params
-class BpeVocab:  # GPT
+#
-    def __init__(
+# vocab
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
+#
-    ) -> None:
+
-        self.bpe_tokenizer = json.loads(
+class BpeVocab:
-            open(str(fname_tokenizer), encoding="utf-8").read()
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        )
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
        self.vocab = self.bpe_tokenizer["model"]["vocab"]
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@ -394,34 +341,31 @@ class BpeVocab:  # GPT
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
-                    (item["content"], item["id"])
+                    (item['content'], item['id'])
-                    for item in tokenizer_json.get("added_tokens", [])
+                    for item in tokenizer_json.get('added_tokens', [])
                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item["content"] not in self.bpe_tokenizer
+                    if item['content'] not in self.bpe_tokenizer)
                )
        vocab_size: int = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        actual_ids      = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
                f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
            )
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
+        self.added_tokens_dict    = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer      = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.fname_added_tokens   = fname_added_tokens
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -442,10 +386,8 @@ class BpeVocab:  # GPT
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-class SentencePieceVocab:  # LlaMa
+class SentencePieceVocab:
-    def __init__(
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
    ) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@ -455,23 +397,19 @@ class SentencePieceVocab:  # LlaMa
        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        new_tokens = {
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
            id: piece for piece, id in added_tokens.items() if id >= vocab_size
        }
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
+        actual_new_ids   = sorted(new_tokens.keys())
        if expected_new_ids != actual_new_ids:
-            raise ValueError(
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
                f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
            )
        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
+        self.vocab_size_base    = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens
    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@ -512,11 +450,15 @@ class SentencePieceVocab:  # LlaMa
 class HfVocab:
-    def __init__(
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
-        self,
+        try:
-        fname_tokenizer: Path,
+            from transformers import AutoTokenizer
-        fname_added_tokens: Optional[Path] = None,
+        except ImportError as e:
-    ) -> None:
+            raise ImportError(
                "To use HfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
@ -529,7 +471,7 @@ class HfVocab:
        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
+        self.added_tokens_ids  = set()
        # Process added tokens
        for tok, tokidx in sorted(
@ -550,12 +492,12 @@ class HfVocab:
        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens
-    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }
@ -573,11 +515,9 @@ class HfVocab:
                token_id, self.special_ids  # Reuse already stored special IDs
            )
-    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
        # Determine token type based on whether it's a special token
-        return (
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
        )
    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
@ -589,7 +529,6 @@ class HfVocab:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0
@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    else:
        model = merge_sharded([mp.model for mp in models_plus])
-    return ModelPlus(model, paths, format, vocab)
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
    CLASSES: dict[tuple[str, str], Any] = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
-        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
-            rebuild_from_type_v2, "__func__"
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
-        ),
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
-        ("torch._utils", "_rebuild_tensor_v2"): getattr(
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
-            lazy_rebuild_tensor_v2, "__func__"
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
-        ),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
-        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
+        ('torch', 'Tensor'): LazyTensor,
        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
        ("torch", "Tensor"): LazyTensor,
    }
    def find_class(self, module: str, name: str) -> Any:
@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
        executor_class = ProcessPoolExecutor
    else:
        executor_class = ThreadPoolExecutor
-    with executor_class(max_workers = max_workers) as executor:
+    with executor_class(max_workers=max_workers) as executor:
        futures: list[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
 class OutputFile:
-    def __init__(
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
    ) -> None:
        self.gguf = gguf.GGUFWriter(
            fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
        )
    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@ -1036,21 +967,16 @@ class OutputFile:
        if params.n_ctx == 4096:
            name = "LLaMA v2"
        elif params.path_model is not None:
-            name = str(params.path_model.parent).split("/")[-1]
+            name = str(params.path_model.parent).split('/')[-1]
-        self.gguf.add_name(name)
+        self.gguf.add_name                (name)
-        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_context_length      (params.n_ctx)
-        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_embedding_length    (params.n_embd)
-        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_block_count         (params.n_layer)
-        self.gguf.add_feed_forward_length(params.n_ff)
+        self.gguf.add_feed_forward_length (params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
-        self.gguf.add_head_count(params.n_head)
+        self.gguf.add_head_count          (params.n_head)
-        self.gguf.add_head_count_kv(params.n_head_kv)
+        self.gguf.add_head_count_kv       (params.n_head_kv)
        if params.f_norm_eps is None:
            raise ValueError("f_norm_eps is None")
        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)
@ -1058,6 +984,11 @@ class OutputFile:
        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)
        if params.f_norm_eps:
            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
        else:
            raise ValueError('f_norm_eps is None')
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -1089,7 +1020,7 @@ class OutputFile:
        return tokenizer_model
-    def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
+    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
        tokens = []
        scores = []
        toktypes = []
@ -1124,14 +1055,10 @@ class OutputFile:
    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
        n_elements = int(np.prod(tensor.shape))
-        raw_dtype = getattr(tensor.data_type, "ggml_type", None)
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
-        data_type = (
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
            getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
        )
        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
            name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
        )
    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@ -1145,14 +1072,10 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
-        fname_out: Path,
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        params: Params,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
        vocab: Vocab,
        svocab: gguf.SpecialVocab,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
        of = OutputFile(fname_out, endianess=endianess)
@ -1180,14 +1103,8 @@ class OutputFile:
    @staticmethod
    def write_all(
-        fname_out: Path,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
-        ftype: GGMLFileType,
+        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        params: Params,
        model: LazyModel,
        vocab: Vocab,
        svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1207,26 +1124,19 @@ class OutputFile:
        of.write_tensor_info()
        # tensor data
-        ndarrays_inner = bounded_parallel_map(
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
            OutputFile.do_item, model.items(), concurrency=concurrency
        )
        if ftype == GGMLFileType.MostlyQ8_0:
            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize,
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
                ndarrays_inner,
                concurrency=concurrency,
                max_workers=concurrency,
                use_processpool_executor=True,
            )
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
            zip(model.items(), ndarrays)
        ):
            elapsed = time.time() - start
-            size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
            print(
                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
 class VocabFactory:
    def __init__(self, path: Path):
        self.path = path
-        self.files = {
+        self.files: dict[str, Path | None] = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
@ -1380,24 +1290,18 @@ class VocabFactory:
                self.files[file] = parent_file_path
        print(f"Found vocab files: {self.files}")
-    def _select_file(self, vocabtype: Optional[str]) -> Path:
+    def _select_file(self, vocabtype: str | None) -> Path:
        if vocabtype in ["spm", "bpe"]:
            for file_key in self.files.keys():
-                if self.files[file_key]:
+                if (file := self.files[file_key]) is not None:
-                    return self.files[file_key]
+                    return file
            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        elif vocabtype == "hfft":
+        if vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
-        else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
-    def _create_special_vocab(
+    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
        self,
        vocab: Vocab,
        vocabtype: str,
        model_parent_path: Path,
    ) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
@ -1407,13 +1311,12 @@ class VocabFactory:
            n_vocab=n_vocab,
        )
-    def load_vocab(
+    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
        self, vocabtype: str, model_parent_path: Path
    ) -> Tuple[Vocab, gguf.SpecialVocab]:
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")
        added_tokens_path = path.parent / "added_tokens.json"
        vocab: Vocab
        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
@ -1428,6 +1331,7 @@ class VocabFactory:
            )
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        # FIXME: Respect --vocab-dir?
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
@ -1436,18 +1340,17 @@ class VocabFactory:
        return vocab, special_vocab
-def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
-        GGMLFileType.AllF32: "f32",
+        GGMLFileType.AllF32:    "f32",
        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0: "q8_0",
+        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
        sys.stderr.write(
            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n"
+            "Please explicitly specify a path using --outfile.\n")
        )
        sys.exit(1)
    return ret
@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
    print(f"model_plus.format = {model_plus.format!r}")
    print(f"model_plus.vocab = {model_plus.vocab!r}")
    for name, lazy_tensor in model_plus.model.items():
-        print(
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
            f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
        )
-def get_argument_parser() -> ArgumentParser:
+def main(args_in: list[str] | None = None) -> None:
    output_choices = ["f32", "f16"]
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
    vocab_types = ["spm", "bpe", "hfft"]
    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
    parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None)
    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--vocab-type",  choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
    parser.add_argument("--big-endian",  action="store_true",    help="model is executed on big endian machine")
    parser.add_argument("--pad-vocab",   action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
-    parser = argparse.ArgumentParser(
+    args = parser.parse_args(args_in)
        description="Convert a LLaMa model to a GGML compatible file"
    )
    parser.add_argument(
        "model",
        type=Path,
        help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
    )
    parser.add_argument(
        "--awq-path",
        type=Path,
        help="Path to the Activation-aware Weight Quantization cache file",
        default=None,
    )
    parser.add_argument(
        "--dump",
        action="store_true",
        help="Display the model content without converting it",
    )
    parser.add_argument(
        "--dump-single",
        action="store_true",
        help="Display the content of a single model file without conversion",
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="Extract and output only the vocabulary",
    )
    parser.add_argument(
        "--outtype",
        choices=output_choices,
        help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
    )
    parser.add_argument(
        "--vocab-dir",
        type=Path,
        help="Directory containing the tokenizer.model, if separate from the model file",
    )
    parser.add_argument(
        "--vocab-type",
        choices=["spm", "bpe", "hfft"],  # hfft: Hugging Face Fast Tokenizer
        default="spm",
        help="The vocabulary format used to define the tokenizer model (default: spm)",
    )
    parser.add_argument(
        "--pad-vocab",
        action="store_true",
        help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="Specify the path for the output file (default is based on input)",
    )
    parser.add_argument(
        "--ctx", type=int, help="Model training context (default is based on input)"
    )
    parser.add_argument(
        "--concurrency",
        type=int,
        help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
        default=DEFAULT_CONCURRENCY,
    )
    parser.add_argument(
        "--big-endian",
        action="store_true",
        help="Indicate that the model is executed on a big-endian machine",
    )
    return parser
 def main(argv: Optional[list[str]] = None) -> None:
    parser = get_argument_parser()
    args = parser.parse_args(argv)
    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
+        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
    if not args.vocab_only:
        model_plus = load_some_model(args.model)
    else:
-        model_plus = ModelPlus(
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
            model={}, paths=[args.model / "dummy"], format="none", vocab=None
        )
    if args.dump:
        do_dump_model(model_plus)
        return
    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG
@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            raise Exception(
+            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                "The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                            "Please specify one with --ctx:\n"
-                "Please specify one with --ctx:\n"
+                            " - LLaMA v1: --ctx 2048\n"
-                " - LLaMA v1: --ctx 2048\n"
+                            " - LLaMA v2: --ctx 4096\n")
                " - LLaMA v2: --ctx 4096\n"
            )
        params.n_ctx = args.ctx
    if args.outtype:
@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        OutputFile.write_vocab_only(
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-            outfile,
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
            params,
            vocab,
            special_vocab,
            endianess=endianess,
            pad_vocab=args.pad_vocab,
        )
        print(f"Wrote {outfile}")
        return
    if model_plus.vocab is not None and args.vocab_dir is None:
        vocab = model_plus.vocab
-    model = model_plus.model
+    print(f"Vocab info: {vocab}")
-    model = convert_model_names(model, params)
+    print(f"Special vocab info: {special_vocab}")
-    ftype = pick_output_type(model, args.outtype)
+
-    model = convert_to_output_type(model, ftype)
+    model   = model_plus.model
-    outfile = args.outfile or default_output_file(model_plus.paths, ftype)
+    model   = convert_model_names(model, params)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")
-    OutputFile.write_all(
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-        outfile,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
        ftype,
        params,
        model,
        vocab,
        special_vocab,
        concurrency=args.concurrency,
        endianess=endianess,
        pad_vocab=args.pad_vocab,
    )
    print(f"Wrote {outfile}")
-if __name__ == "__main__":
+if __name__ == '__main__':
-    main(sys.argv[1:])  # Exclude the first element (script name) from sys.argv
+    main()
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -26,6 +26,7 @@ struct StatParams {
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
    int         keep_every = 0;
    bool        collect_output_weight = false;
 };
@ -42,6 +43,9 @@ private:
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
                                                  //
    void save_imatrix(const char * file_name) const;
    void keep_imatrix(int ncall) const;
 };
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                if (m_last_call % m_params.n_output_frequency == 0) {
                    save_imatrix();
                }
                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
                    keep_imatrix(m_last_call);
                }
            }
        }
    } else {
@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            if (m_last_call % m_params.n_output_frequency == 0) {
                save_imatrix();
            }
            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
                keep_imatrix(m_last_call);
            }
        }
    }
@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }
 void IMatrixCollector::save_imatrix() const {
-    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
 }
 // Saves a snapshot of the current imatrix to "<ofile>.at_<ncall>", using
 // "imatrix.dat" as the base name when no output file was configured.
 void IMatrixCollector::keep_imatrix(int ncall) const {
    const std::string base_name = m_params.ofile.empty() ? std::string("imatrix.dat") : m_params.ofile;
    const std::string snapshot  = base_name + ".at_" + std::to_string(ncall);
    save_imatrix(snapshot.c_str());
 }
 void IMatrixCollector::save_imatrix(const char * fname) const {
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
    out.write((const char*)&n_entries, sizeof(n_entries));
@ -248,7 +269,7 @@ static void process_logits(
    }
 }
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    }
    std::vector<float> logit_history;
    logit_history.resize(tokens.size());
    std::vector<float> prob_history;
-    prob_history.resize(tokens.size());
+
    if (compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }
    const int n_chunk_max = tokens.size() / n_ctx;
@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
    std::vector<float> logits;
    if (compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;
        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
        std::vector<float> logits;
        const auto t_start = std::chrono::high_resolution_clock::now();
@ -321,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;
-            const auto * batch_logits = llama_get_logits(ctx);
+            if (compute_ppl && num_batches > 1) {
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }
        const auto t_end = std::chrono::high_resolution_clock::now();
@ -338,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }
-        const int first = n_ctx/2;
+        if (compute_ppl) {
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+            const int first = n_ctx/2;
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        count += n_ctx - first - 1;
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
            count += n_ctx - first - 1;
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
+            fflush(stdout);
            logits.clear();
        }
    }
    printf("\n");
-    nll2 /= count;
+    if (compute_ppl) {
-    nll /= count;
+        nll2 /= count;
-    const double ppl = exp(nll);
+        nll /= count;
-    nll2 -= nll * nll;
+        const double ppl = exp(nll);
-    if (nll2 > 0) {
+        nll2 -= nll * nll;
-        nll2 = sqrt(nll2/(count-1));
+        if (nll2 > 0) {
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+            nll2 = sqrt(nll2/(count-1));
-    } else {
+            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-        printf("Unexpected negative standard deviation of log(prob)\n");
+        } else {
            printf("Unexpected negative standard deviation of log(prob)\n");
        }
    }
    return true;
@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
    StatParams sparams;
    bool compute_ppl = true;
    std::vector<char*> args;
    args.push_back(argv[0]);
    int iarg = 1;
@ -381,12 +419,21 @@ int main(int argc, char ** argv) {
        }
        else if (arg == "--verbosity") {
            sparams.verbosity = std::stoi(argv[++iarg]);
        } else if (arg == "--no-ppl") {
            compute_ppl = false;
        } else if (arg == "--keep-imatrix") {
            sparams.keep_every = std::stoi(argv[++iarg]);
        } else {
            args.push_back(argv[iarg]);
        }
    }
    if (iarg < argc) {
-        args.push_back(argv[iarg]);
+        std::string arg{argv[iarg]};
        if (arg == "--no-ppl") {
            compute_ppl = false;
        } else {
            args.push_back(argv[iarg]);
        }
    }
    gpt_params params;
@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }
-    bool OK = compute_imatrix(ctx, params);
+    bool OK = compute_imatrix(ctx, params, compute_ppl);
    if (!OK) {
        return 1;
    }
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -0,0 +1,131 @@
 # MobileVLM
 Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
 For more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM).
 The implementation is based on llava, and is compatible with both llava and MobileVLM. The usage is basically the same as llava.
 ## Usage
 Build with cmake or run `make llava-cli` to build it.
 After building, run: `./llava-cli` to see the usage. For example:
 ```sh
 ./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
    --image path/to/an/image.jpg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
 ```
 ## Model conversion
 1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
 ```sh
 git clone https://huggingface.co/mtgv/MobileVLM-1.7B
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```
 2. Use `llava-surgery.py` to split the LLaVA model into LLaMA and multimodal projector constituents:
 ```sh
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```
 3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
    --projector-type ldp
 ```
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 ```sh
 python ./convert.py path/to/MobileVLM-1.7B
 ```
 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
 ./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
 ## Android compile and run
 ### compile
 refer to `examples/llava/android/build_64.sh`
 ```sh
 mkdir examples/llava/android/build_64
 cd examples/llava/android/build_64
 ../build_64.sh
 ```
 ### run on Android
 refer to `examples/llava/android/adb_run.sh`, and modify the resources' `name` and `path` in it
 ## some result on Android with `Snapdragon 888` chip
 ### case 1
 **input**
 ```sh
 /data/local/tmp/llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
    --image /data/local/tmp/demo.jpg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
 ```
 **output**
 ```sh
 encode_image_with_clip: image encoded in 21148.71 ms by CLIP (  146.87 ms per image patch)
 Susan Wise Bauer
 llama_print_timings:        load time =   23574.72 ms
 llama_print_timings:      sample time =       1.24 ms /     6 runs   (    0.21 ms per token,  4850.44 tokens per second)
 llama_print_timings: prompt eval time =   12460.15 ms /   246 tokens (   50.65 ms per token,    19.74 tokens per second)
 llama_print_timings:        eval time =     424.86 ms /     6 runs   (   70.81 ms per token,    14.12 tokens per second)
 llama_print_timings:       total time =   34731.93 ms
 ```
 ### case 2
 **input**
 ```sh
 /data/local/tmp/llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
    --image /data/local/tmp/cat.jpeg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 ```
 **output**
 ```sh
 encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
 The image depicts a cat sitting in the grass near some tall green plants.
 llama_print_timings:        load time =   23257.32 ms
 llama_print_timings:      sample time =       5.25 ms /    18 runs   (    0.29 ms per token,  3430.53 tokens per second)
 llama_print_timings: prompt eval time =   11900.73 ms /   232 tokens (   51.30 ms per token,    19.49 tokens per second)
 llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 ms per token,    14.07 tokens per second)
 llama_print_timings:       total time =   34570.79 ms
 ```
 ## Minor shortcomings
 The `n_patch` of the output in `ldp` is 1/4 of the input. In order to implement this quickly, we uniformly modified the `clip_n_patches` function to return a quarter of the patches. As a result, when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
 ## TODO
 - [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
 - [ ] Optimize LDP projector performance
      - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
      - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
 - [ ] run MobileVLM on `Jetson Orin`
 - [ ] Support more model variants, such as `MobileVLM-3B`.
 ## contributor
 ```sh
 zhangjidong05, yangyang260, huyiming03, chenxiaotao03
 ```
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@ -0,0 +1,53 @@
 #!/bin/bash
 # Host-side inputs. These are hard-coded absolute paths and must be edited
 # to match the local machine before running the script.
 # model_dir is only referenced by the commented-out `adb push` lines in android_run.
 model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
 projector_name="mmproj-model-f16.gguf"
 llama_name="ggml-model-q4_k.gguf"
 img_dir="/Users/cxt/model/llm"
 img_name="demo.jpg"
 prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
 # Alternative image/prompt pair; uncomment both lines to use it instead.
 # img_name="cat.jpeg"
 # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 # Location (relative to the current directory) and name of the binary to push.
 program_dir="build_64/bin"
 binName="llava-cli"
 # Value passed to the binary's -t option.
 n_threads=4
 # Directory on the Android device that receives the binary, image and log.
 deviceDir="/data/local/tmp"
 # Host directory the log file is pulled into; created if it does not exist.
 saveDir="output"
 if [ ! -d ${saveDir} ]; then
    mkdir ${saveDir}
 fi
 # Pushes the image and the llava-cli binary to the device, runs the binary
 # there with the configured model/projector/prompt, and pulls the resulting
 # log file back into ${saveDir}.
 function android_run() {
    # On-device log file. `modelName` is not assigned anywhere in this script,
    # so fall back to the LLaMA model file name instead of producing a name
    # that starts with a bare underscore. A caller-provided modelName still wins.
    local result_file="${deviceDir}/${modelName:-${llama_name}}_${projector_name}_${n_threads}_${img_name}.txt"
    # # copy resource into device
    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
    adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
    # copy program into device
    adb push ${program_dir}/${binName} ${deviceDir}/${binName}
    adb shell "chmod 0777 ${deviceDir}/${binName}"
    # run
    # First record the command line itself at the top of the log (truncates the file) ...
    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
                                                 -m ${deviceDir}/${llama_name} \
                                                 --mmproj ${deviceDir}/${projector_name} \
                                                 -t ${n_threads} \
                                                 --image ${deviceDir}/${img_name} \
                                                 -p \"${prompt}\" \
                                                 > ${result_file}"
    # ... then run it for real, appending stdout and stderr to the same log.
    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
                                                 -m ${deviceDir}/${llama_name} \
                                                 --mmproj ${deviceDir}/${projector_name} \
                                                 -t ${n_threads} \
                                                 --image ${deviceDir}/${img_name} \
                                                 -p \"${prompt}\" \
                                                 >> ${result_file} 2>&1"
    adb pull ${result_file} ${saveDir}
 }
 android_run
 echo "android_run is Done!"
--- a/examples/llava/android/build_64.sh
+++ b/examples/llava/android/build_64.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Configure a Release build for Android arm64-v8a (platform android-23) from
 # the source tree four directories up, using the CMake toolchain file shipped
 # under $ANDROID_NDK (read from the environment). The script's first argument,
 # if given, is appended to the cmake command line.
 cmake ../../../../ \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DCMAKE_BUILD_TYPE=Release \
 -DANDROID_ABI="arm64-v8a" \
 -DANDROID_PLATFORM=android-23 $1
 # Build with 4 parallel jobs.
 make -j4
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -12,6 +12,7 @@
 #include <regex>
 #include <stdexcept>
 #include <vector>
 #include <sstream>
 #include "clip.h"
 #include "ggml.h"
@ -67,6 +68,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"
 //
 // tensor name constants
@ -89,6 +91,21 @@ static std::string format(const char * fmt, ...) {
 #define TN_TEXT_PROJ "text_projection.weight"
 #define TN_VIS_PROJ "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 // Kind of projector applied to the vision embeddings when building the graph.
 enum projector_type {
    // llava projector (uses the mm_0 / mm_2 weights)
    PROJECTOR_TYPE_MLP,
    // MobileVLM projector (uses the mm_model_mlp_* / mm_model_block_* weights)
    PROJECTOR_TYPE_LDP,
    // returned by clip_projector_type_from_string when the name matches no known type
    PROJECTOR_TYPE_UNKNOWN,
 };
 // String name for each known projector type. clip_projector_type_from_string
 // searches this table by value; PROJECTOR_TYPE_UNKNOWN has no entry here.
 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP,           "mlp"     },
    { PROJECTOR_TYPE_LDP,          "ldp"    },
 };
 //
 // utilities to get data from a gguf file
@ -129,6 +146,91 @@ static std::string get_ftype(int ftype) {
    return ggml_type_name(static_cast<ggml_type>(ftype));
 }
 // Reads element `i` of `data` viewed as an array of T and formats it with std::to_string.
 template <typename T>
 static std::string gguf_elem_to_str(const void * data, int i) {
    return std::to_string(static_cast<const T *>(data)[i]);
 }
 // Formats element `i` of the raw buffer `data`, interpreted according to `type`.
 // Booleans become "true"/"false", numeric types go through std::to_string, and
 // any other type yields "unknown type <id>".
 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
    if (type == GGUF_TYPE_BOOL) {
        return static_cast<const bool *>(data)[i] ? "true" : "false";
    }
    switch (type) {
        case GGUF_TYPE_UINT8:   return gguf_elem_to_str<uint8_t >(data, i);
        case GGUF_TYPE_INT8:    return gguf_elem_to_str<int8_t  >(data, i);
        case GGUF_TYPE_UINT16:  return gguf_elem_to_str<uint16_t>(data, i);
        case GGUF_TYPE_INT16:   return gguf_elem_to_str<int16_t >(data, i);
        case GGUF_TYPE_UINT32:  return gguf_elem_to_str<uint32_t>(data, i);
        case GGUF_TYPE_INT32:   return gguf_elem_to_str<int32_t >(data, i);
        case GGUF_TYPE_UINT64:  return gguf_elem_to_str<uint64_t>(data, i);
        case GGUF_TYPE_INT64:   return gguf_elem_to_str<int64_t >(data, i);
        case GGUF_TYPE_FLOAT32: return gguf_elem_to_str<float   >(data, i);
        case GGUF_TYPE_FLOAT64: return gguf_elem_to_str<double  >(data, i);
        default:                break;
    }
    return format("unknown type %d", type);
 }
 // Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
 // scanning left to right. Text inserted by a replacement is never re-scanned,
 // so `replace` may itself contain `search`.
 // An empty `search` leaves `s` unchanged: std::string::find matches an empty
 // needle at the current position, so the scan would otherwise never advance.
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string result;
    size_t pos = 0;
    for (;;) {
        const size_t hit = s.find(search, pos);
        if (hit == std::string::npos) {
            // no more matches: copy the remaining tail verbatim
            result.append(s, pos, std::string::npos);
            break;
        }
        result.append(s, pos, hit - pos);
        result += replace;
        pos = hit + search.length();
    }
    s = std::move(result);
 }
 // Renders the value of key-value pair `i` of `ctx_gguf` as a string.
 //  - string values are returned as-is
 //  - arrays are rendered as "[a, b, ...]"; string elements are double-quoted
 //    with backslashes and quotes escaped, nested arrays are shown as "???"
 //  - every other type is formatted by gguf_data_to_str
 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
    const enum gguf_type kv_type = gguf_get_kv_type(ctx_gguf, i);
    if (kv_type == GGUF_TYPE_STRING) {
        return gguf_get_val_str(ctx_gguf, i);
    }
    if (kv_type != GGUF_TYPE_ARRAY) {
        return gguf_data_to_str(kv_type, gguf_get_val_data(ctx_gguf, i), 0);
    }
    const enum gguf_type elem_type = gguf_get_arr_type(ctx_gguf, i);
    const int n_elems = gguf_get_arr_n(ctx_gguf, i);
    const void * elems = gguf_get_arr_data(ctx_gguf, i);
    std::stringstream out;
    out << "[";
    for (int idx = 0; idx < n_elems; ++idx) {
        // separator goes between elements, never after the last one
        if (idx > 0) {
            out << ", ";
        }
        if (elem_type == GGUF_TYPE_STRING) {
            std::string item = gguf_get_arr_str(ctx_gguf, i, idx);
            // escape backslashes first so the quote escapes are not doubled
            replace_all(item, "\\", "\\\\");
            replace_all(item, "\"", "\\\"");
            out << '"' << item << '"';
        } else if (elem_type == GGUF_TYPE_ARRAY) {
            out << "???";
        } else {
            out << gguf_data_to_str(elem_type, elems, idx);
        }
    }
    out << "]";
    return out.str();
 }
 // Debug helper: prints the dimension count, name, byte size, shape and type id
 // of `tensor` to stdout, prefixed with `prefix`.
 static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
    const size_t tensor_size = ggml_nbytes(tensor);
    // ggml declares the dimensions (ne[]) as int64_t, so passing them to "%d"
    // is a varargs type mismatch. Cast explicitly and print with "%lld".
    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%lld, %lld, %lld, %lld], type: %d\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            static_cast<long long>(tensor->ne[0]), static_cast<long long>(tensor->ne[1]),
            static_cast<long long>(tensor->ne[2]), static_cast<long long>(tensor->ne[3]),
            static_cast<int>(tensor->type));
 }
 // Reverse lookup in PROJECTOR_TYPE_NAMES: returns the projector type whose
 // registered name equals `name`, or PROJECTOR_TYPE_UNKNOWN if there is none.
 static projector_type clip_projector_type_from_string(const std::string & name) {
    for (auto it = PROJECTOR_TYPE_NAMES.begin(); it != PROJECTOR_TYPE_NAMES.end(); ++it) { // NOLINT
        if (name == it->second) {
            return it->first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
 }
 //
 // image data
 //
@ -205,6 +307,32 @@ struct clip_vision_model {
    struct ggml_tensor * mm_0_b;
    struct ggml_tensor * mm_2_w;
    struct ggml_tensor * mm_2_b;
    // MobileVLM projection
    struct ggml_tensor * mm_model_mlp_1_w;
    struct ggml_tensor * mm_model_mlp_1_b;
    struct ggml_tensor * mm_model_mlp_3_w;
    struct ggml_tensor * mm_model_mlp_3_b;
    struct ggml_tensor * mm_model_block_1_block_0_0_w;
    struct ggml_tensor * mm_model_block_1_block_0_1_w;
    struct ggml_tensor * mm_model_block_1_block_0_1_b;
    struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
    struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
    struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
    struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
    struct ggml_tensor * mm_model_block_1_block_2_0_w;
    struct ggml_tensor * mm_model_block_1_block_2_1_w;
    struct ggml_tensor * mm_model_block_1_block_2_1_b;
    struct ggml_tensor * mm_model_block_2_block_0_0_w;
    struct ggml_tensor * mm_model_block_2_block_0_1_w;
    struct ggml_tensor * mm_model_block_2_block_0_1_b;
    struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
    struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
    struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
    struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
    struct ggml_tensor * mm_model_block_2_block_2_0_w;
    struct ggml_tensor * mm_model_block_2_block_2_1_w;
    struct ggml_tensor * mm_model_block_2_block_2_1_b;
 };
 struct clip_ctx {
@ -213,6 +341,7 @@ struct clip_ctx {
    bool has_llava_projector = false;
    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
    float image_mean[3];
    float image_std[3];
@ -430,16 +559,135 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            free(patches_data);
        }
        // shape [1, 576, 1024]
        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);
-        // mm projection 0
+        // print_tensor_info(embeddings, "embeddings");
        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-        embeddings = ggml_gelu(ctx0, embeddings);
+        // llava projector
        if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+            embeddings = ggml_gelu(ctx0, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
        }
        else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projector
            int n_patch = 24;
            struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
            mlp_1 = ggml_gelu(ctx0, mlp_1);
            struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
            // block 1
            struct ggml_tensor * block_1 = nullptr;
            {
                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                // stride = 1, padding = 1, bias is nullptr
                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, nullptr, 1, 1, 1, 1, 1, 1);
                // layer norm
                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // hardswish
                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);
                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // residual
                block_1 = ggml_add(ctx0, mlp_3, block_1);
            }
            // block_2
            {
                // stride = 2
                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, nullptr, 2, 2, 1, 1, 1, 1);
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // layer norm
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // hardswish
                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
                // not sure the parameters is right for globalAvgPooling
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);
                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
            }
            embeddings = block_1;
        }
        else {
            GGML_ASSERT(false);
        }
    }
    // build the graph
@ -485,16 +733,55 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        printf("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);
    // kv
-    if (verbosity >= 3) {
+    const int n_kv = gguf_get_n_kv(ctx);
-        const int n_kv = gguf_get_n_kv(ctx);
+    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
-        for (int i = 0; i < n_kv; ++i) {
+        uint32_t n_type_max = 0;
-            const char * key = gguf_get_key(ctx, i);
+        enum ggml_type type_max = GGML_TYPE_F32;
-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+        for (int i = 0; i < n_tensors; i++) {
            enum ggml_type type = gguf_get_tensor_type(ctx, i);
            n_type[type]++;
            if (n_type_max < n_type[type]) {
                n_type_max = n_type[type];
                type_max   = type;
            }
        }
        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
                : gguf_type_name(type);
            std::string value          = gguf_kv_to_str(ctx, i);
            const size_t MAX_VALUE_LEN = 40;
            if (value.size() > MAX_VALUE_LEN) {
                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
            }
            replace_all(value, "\n", "\\n");
            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }
        // print type counts
        for (auto & kv : n_type) {
            if (kv.second == 0) {
                continue;
            }
            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
        printf("\n");
    }
    // data
@ -503,20 +790,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
            enum ggml_type type = gguf_get_tensor_type(ctx, i);
            struct ggml_tensor * cur = ggml_get_tensor(meta, name);
            size_t tensor_size = ggml_nbytes(cur);
            buffer_size += tensor_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%d, %d, %d, %d], type: %d\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, offset);
+                       ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], type);
            }
        }
    }
    buffer_size += n_tensors * 128 /* CLIP PADDING */;
    clip_ctx * new_clip = new clip_ctx;
    // update projector type
    {
        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
        if (idx != -1) {
            const std::string proj_type = gguf_get_val_str(ctx, idx);
            new_clip->proj_type = clip_projector_type_from_string(proj_type);
        }
        else {
            new_clip->proj_type = PROJECTOR_TYPE_MLP;
        }
    }
 #ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
@ -661,10 +963,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+
-        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+        // LLaVA projection
-        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
-        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
            vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
            vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
            vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
        }
        else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
            vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
            vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
            vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
            vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
            vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
            vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
            vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
            vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
            vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
            vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
            vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
            vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
            vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
            vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
            vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
            vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
            vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
            vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
            vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
            vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
            vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
            vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
        }
        else {
            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
        }
        vision_model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@ -1100,13 +1437,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 }
 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-    return ctx->vision_model.mm_2_b->ne[0];
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
    }
    else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
        return ctx->vision_model.mm_2_b->ne[0];
    }
    else {
        std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
    }
 }
 int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;
-
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
        n_patches /= 4;
    }
    return n_patches;
 }
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
    # add projector type
    fout.add_string("clip.projector_type", args.projector_type)
 else:
    fout.add_description("two-tower CLIP model")
@ -218,7 +221,8 @@ if has_llava_projector:
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }
 // Rounds fval to the nearest integer without calling a rounding function.
 // Adding 12582912.f (1.5 * 2^23) pushes the value into a range where the float
 // spacing is exactly 1, so the low 23 mantissa bits end up holding the rounded
 // value offset by 0x00400000 (2^22).
 static inline int nearest_int(float fval) {
    //assert(fval <= 4194303.f);
    const float shifted = fval + 12582912.f;
    int bits;
    memcpy(&bits, &shifted, sizeof(int));
    // keep only the mantissa, then remove the 2^22 bias
    const int mantissa = bits & 0x007fffff;
    return mantissa - 0x00400000;
 }
 // Computes the negative log-probability of token `tok` under softmax(logits)
 // and stores a 16-bit quantized version of the logits in `log_prob`.
 //
 // Layout written to `log_prob`:
 //   - the first 4 uint16_t slots (8 bytes) hold two floats: `scale` and `min_log_prob`
 //   - the following n_vocab slots hold the quantized values q[i], such that
 //     scale*q[i] + min_log_prob reconstructs the log-probability of token i
 //     (logits not above the clamped minimum are stored as 0)
 //
 // Returns max_logit + log(sum_exp) - logits[tok].
 static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
    float max_logit = logits[0];
    float min_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        max_logit = std::max(max_logit, logits[i]);
        min_logit = std::min(min_logit, logits[i]);
    }
    // limit the quantized range to at most 16 below the largest logit
    min_logit = std::max(min_logit, max_logit - 16);
    double sum_exp = 0.0;
    for (int i = 0; i < n_vocab; ++i) {
        sum_exp += expf(logits[i] - max_logit);
    }
    const float log_sum_exp = log(sum_exp);
    const float min_log_prob = min_logit - max_logit - log_sum_exp;
    const float scale = (max_logit - min_logit)/65535.f;
    // Store the float header with memcpy: writing through a float* that aliases
    // uint16_t storage would violate the strict aliasing rules.
    const float header[2] = { scale, min_log_prob };
    std::memcpy(log_prob, header, sizeof(header));
    log_prob += 4;
    if (scale) {
        const float inv_scale = 1/scale;
        for (int i = 0; i < n_vocab; ++i) {
            log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
        }
    } else {
        // all logits are equal: every quantized value is 0
        std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
    }
    return max_logit + log_sum_exp - logits[tok];
 }
 static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history
@ -147,6 +184,114 @@ static void process_logits(
    }
 }
 // Runs log_softmax over n_token consecutive rows of `logits` using the given
 // worker threads plus the calling thread, adds the resulting negative
 // log-likelihoods (and their squares) to nll / nll2, and finally writes the
 // quantized rows collected in `log_probs` to `out`.
 static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
        std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
    // number of uint16_t slots per row: 4 header slots + n_vocab rounded up to an even count
    const int row_size = 2*((n_vocab + 1)/2) + 4;
    std::mutex guard;
    int next_row = 0;
    auto worker = [&guard, &next_row, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, row_size] () {
        double nll_acc  = 0;
        double nll2_acc = 0;
        for (;;) {
            int row;
            {
                // the mutex protects both the row counter and the shared totals
                std::lock_guard<std::mutex> hold(guard);
                row = next_row++;
                if (row >= n_token) {
                    nll  += nll_acc;
                    nll2 += nll2_acc;
                    return;
                }
            }
            const double v = log_softmax(n_vocab, logits + row*n_vocab, log_probs.data() + row*row_size, tokens[row+1]);
            nll_acc  += v;
            nll2_acc += v*v;
        }
    };
    for (auto & t : workers) {
        t = std::thread(worker);
    }
    // the calling thread takes part in the work as well
    worker();
    for (auto & t : workers) {
        t.join();
    }
    out.write((const char *)log_probs.data(), n_token*row_size*sizeof(uint16_t));
 }
 // Running totals accumulated while comparing a model against stored base
 // log-probabilities. Every field starts at zero.
 struct kl_divergence_result {
    // sum of the per-token negative log-likelihoods and of their squares
    double sum_nll  = 0.0;
    double sum_nll2 = 0.0;
    // sum of the per-token KL divergence terms and of their squares
    double sum_kld  = 0.0;
    double sum_kld2 = 0.0;
    // sum of (negative log-likelihood + base log-probability of the token) and of its square
    double sum_nll_diff  = 0.0;
    double sum_nll_diff2 = 0.0;
    // number of tokens accumulated so far
    size_t count = 0;
 };
 // Accumulates into `kld` the statistics for one token position:
 //   - the negative log-likelihood of `tok` under softmax(logits)
 //   - that value plus the base log-probability of `tok`
 //   - the sum over the vocabulary of p_base * (log p_base - log p)
 //
 // `base_log_prob` points at one stored row: two floats (scale, min_log_prob)
 // occupying the first 4 uint16_t slots, followed by n_vocab quantized values q
 // that decode to scale*q + min_log_prob.
 static void log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        max_logit = std::max(max_logit, logits[i]);
    }
    double sum_exp = 0.0;
    for (int i = 0; i < n_vocab; ++i) {
        sum_exp += expf(logits[i] - max_logit);
    }
    const float log_sum_exp = log(sum_exp);
    // Load the float header with memcpy: reading it through a float* that aliases
    // uint16_t storage would violate the strict aliasing rules.
    float header[2];
    std::memcpy(header, base_log_prob, sizeof(header));
    const float scale = header[0];
    const float min_log_prob = header[1];
    base_log_prob += 4;
    float nll = max_logit + log_sum_exp - logits[tok];
    kld.sum_nll  += nll;
    kld.sum_nll2 += nll*nll;
    // adding the base log-probability turns nll into the difference to the base nll
    nll += (scale*base_log_prob[tok] + min_log_prob);
    kld.sum_nll_diff  += nll;
    kld.sum_nll_diff2 += nll*nll;
    // from here on max_logit holds the log of the softmax normalizer
    max_logit += log_sum_exp;
    double sum = 0;
    for (int i = 0; i < n_vocab; ++i) {
        const float p_log_base = scale*base_log_prob[i] + min_log_prob;
        // skip entries whose base log-probability is -16 or lower
        if (p_log_base > -16.f) {
            const float p_base = expf(p_log_base);
            sum += p_base * (p_log_base - logits[i] + max_logit);
        }
    }
    kld.sum_kld  += sum;
    kld.sum_kld2 += sum*sum;
    ++kld.count;
 }
 static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld) {
    std::mutex mutex;
    const int nv = 2*((n_vocab + 1)/2) + 4;
    int counter = 0;
    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv] () {
        kl_divergence_result local_kld;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
                kld.sum_nll  += local_kld.sum_nll;
                kld.sum_nll2 += local_kld.sum_nll2;
                kld.sum_kld  += local_kld.sum_kld;
                kld.sum_kld2 += local_kld.sum_kld2;
                kld.sum_nll_diff  += local_kld.sum_nll_diff;
                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
                kld.count += local_kld.count;
                break;
            }
            lock.unlock();
            log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
        }
    };
    for (auto & w : workers) {
        w = std::thread(compute);
    }
    compute();
    for (auto & w : workers) {
        w.join();
    }
 }
 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -294,6 +439,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
    std::ofstream logits_stream;
    if (!params.logits_file.empty()) {
        logits_stream.open(params.logits_file.c_str());
        if (!logits_stream.is_open()) {
            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
            return {};
        }
        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
        logits_stream.write("_logits_", 8);
        logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
    }
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,6 +493,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
    std::vector<uint16_t> log_probs;
    if (!params.logits_file.empty()) {
        logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
        logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
        logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
        const int nv = 2*((n_vocab + 1)/2) + 4;
        log_probs.resize(n_ctx * nv);
    }
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;
@ -398,8 +564,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // process the entire prompt.
        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        if (!params.logits_file.empty()) {
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, log_probs, nll, nll2);
        } else {
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        }
        count += n_ctx - first - 1;
        // perplexity is e^(average negative log-likelihood)
@ -458,23 +629,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
    return true;
 }
 #define K_TOKEN_CHUNK 4
 static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
        const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
    constexpr int k_token_chunk = 4;
    if (eval_results.size() != eval_pairs.size()) {
        eval_results.resize(eval_pairs.size());
    }
    if (eval_pairs.empty()) return;
-    size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size());
+    size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
    std::atomic<int> counter(0);
    auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
-        float local_logprobs[k_token_chunk];
+        float local_logprobs[K_TOKEN_CHUNK];
        while (true) {
-            size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed);
+            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + k_token_chunk, eval_results.size());
+            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
            for (size_t i = first; i < last; ++i) {
                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
                float max_logit = logits[0];
@ -497,7 +669,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
    for (size_t it = 0; it < max_threads; ++it) {
        workers[it].join();
    }
 }
 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@ -540,14 +711,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // This is needed as usual for LLaMA models
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
    // Number of tasks to use when computing the score
    if (params.hellaswag_tasks < hs_task_count) {
        hs_task_count = params.hellaswag_tasks;
    }
    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
    // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
    std::mt19937 rng(1);
@ -1031,6 +1202,531 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }
 // Reads a length-prefixed string from `in`: a uint32_t byte count followed by
 // that many raw bytes. Returns true only when both reads succeed.
 static bool deserialize_string(std::istream& in, std::string& str) {
    uint32_t n_bytes;
    in.read((char *)&n_bytes, sizeof(n_bytes));
    if (in.fail()) {
        return false;
    }
    str.resize(n_bytes);
    in.read(&str[0], n_bytes);
    return !in.fail();
 }
 // One set of candidate answers together with one integer label per answer.
 struct multiple_choice_answers {
    std::vector<std::string> answers;
    std::vector<int>         labels;
    // Reads a uint32_t answer count, that many length-prefixed strings, and then
    // the same number of raw int labels. Returns false on any read failure or
    // when the count exceeds 100.
    bool deserialize(std::istream& in) {
        uint32_t n_answers;
        in.read((char *)&n_answers, sizeof(n_answers));
        if (in.fail()) {
            return false;
        }
        // 100 as max. number of answers should be good enough for any practical purpose
        if (n_answers > 100) {
            return false;
        }
        answers.resize(n_answers);
        labels.resize(n_answers);
        for (size_t i = 0; i < answers.size(); ++i) {
            if (!deserialize_string(in, answers[i])) {
                return false;
            }
        }
        in.read((char *)labels.data(), n_answers*sizeof(int));
        return !in.fail();
    }
 };
 // A single multiple-choice task: the serialized question and answer sets, plus
 // the scratch data filled in while the task is being evaluated.
 struct multiple_choice_task {
    // the question (or the context that has to be continued)
    std::string question;
    // candidate answers (continuations) with exactly one correct answer
    multiple_choice_answers mc1;
    // candidate answers (continuations) with several correct answers - not handled yet
    multiple_choice_answers mc2;
    // Reads the question, then mc1, then mc2; stops at the first failure.
    bool deserialize(std::istream& in) {
        return deserialize_string(in, question) && mc1.deserialize(in) && mc2.deserialize(in);
    }
    // Evaluation state
    size_t i_batch;         // index in the llama_batch where this task starts
    size_t common_prefix;   // number of leading tokens shared by all sequences
    size_t required_tokens; // number of tokens needed to evaluate all answers
    std::vector<std::vector<llama_token>> seq_tokens;
    std::vector<float> log_probs;
 };
 // Tokenizes "question + ' ' + answer" for every mc1 answer of `task`, then
 // records how many leading tokens all sequences share (common_prefix) and how
 // many tokens are needed in total when that prefix is counted only once
 // (required_tokens). Returns false, optionally printing a message, when the
 // question, the answer list, or any single answer is empty.
 static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
    const auto & answers = task.mc1.answers;
    if (task.question.empty() || answers.empty()) {
        if (log_error) {
            printf("%s: found bad task with empty question and/or answers\n", __func__);
        }
        return false;
    }
    auto & seqs = task.seq_tokens;
    seqs.reserve(answers.size());
    for (const auto & answer : answers) {
        if (answer.empty()) {
            if (log_error) {
                printf("%s: found empty answer\n", __func__);
            }
            return false;
        }
        seqs.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
    }
    // the shared prefix cannot be longer than the shortest sequence
    auto shortest = seqs.front().size();
    for (const auto & seq : seqs) {
        shortest = std::min(shortest, seq.size());
    }
    size_t n_common = 0;
    while (n_common < shortest) {
        const auto reference = seqs[0][n_common];
        bool differs = false;
        for (size_t s = 1; s < seqs.size() && !differs; ++s) {
            differs = seqs[s][n_common] != reference;
        }
        if (differs) {
            break;
        }
        ++n_common;
    }
    task.common_prefix = n_common;
    // the prefix once, plus the remaining tail of every sequence
    size_t n_required = n_common;
    for (const auto & seq : seqs) {
        n_required += seq.size() - n_common;
    }
    task.required_tokens = n_required;
    return true;
 }
 //
 // Calculates score for multiple choice tasks with single correct answer from prompt.
 // Commonly used LLM evaluation metrics of this type are
 //   * ARC
 //   * HellaSwag
 //   * MMLU
 //   * TruthfulQA
 //
 // Validation datasets for these 4 tests can be found at
 //     https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
 // The data for these datasets was extracted from
 //     git@hf.co:datasets/allenai/ai2_arc
 //     https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
 //     git@hf.co:datasets/Stevross/mmlu
 //     https://huggingface.co/datasets/truthful_qa
 //
 static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
    std::istringstream strstream(params.prompt);
    uint32_t n_task;
    strstream.read((char *)&n_task, sizeof(n_task));
    if (strstream.fail() || n_task == 0) {
        printf("%s: no tasks\n", __func__);
        return;
    }
    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
        printf("%s: failed to raad task positions from prompt\n", __func__);
        return;
    }
    std::vector<multiple_choice_task> tasks;
    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
        // Use all tasks
        tasks.resize(n_task);
        printf("%s: reading tasks", __func__);
        int n_dot = n_task/100;
        int i = 0;
        for (auto& task : tasks) {
            ++i;
            if (!task.deserialize(strstream)) {
                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
                return;
            }
            if (i%n_dot == 0) printf(".");
        }
        printf("done\n");
    }
    else {
        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
        std::mt19937 rng(1);
        std::vector<int> aux(n_task);
        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
        float scale = 1.f/(1.f + (float)std::mt19937::max());
        tasks.resize(params.multiple_choice_tasks);
        for (auto& task : tasks) {
            int j = (int)(scale * rng() * aux.size());
            int idx = aux[j];
            aux[j] = aux.back();
            aux.pop_back();
            strstream.seekg(task_pos[idx], std::ios::beg);
            if (!task.deserialize(strstream)) {
                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                return;
            }
        }
        n_task = params.multiple_choice_tasks;
    }
    // This is needed as usual for LLaMA models
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    printf("%s: preparing task data", __func__);
    fflush(stdout);
    if (n_task > 500) {
        printf("...");
        fflush(stdout);
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
            int num_tasks = tasks.size();
            int n_bad_local = 0;
            while (true) {
                int first = counter.fetch_add(K_TOKEN_CHUNK);
                if (first >= num_tasks) {
                    if (n_bad_local > 0) n_bad += n_bad_local;
                    break;
                }
                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
                for (int i = first; i < last; ++i) {
                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
                }
            }
        };
        size_t max_thread = std::thread::hardware_concurrency();
        max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
        std::vector<std::thread> workers(max_thread-1);
        for (auto& w : workers) w = std::thread(prepare);
        prepare();
        for (auto& w : workers) w.join();
        printf("done\n");
        fflush(stdout);
        int nbad = n_bad;
        if (nbad > 0) {
            printf("%s: found %d malformed tasks\n", __func__, nbad);
            return;
        }
    } else {
        int n_dot = n_task/100;
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
                return;
            }
            if (i_task%n_dot == 0) {
                printf(".");
                fflush(stdout);
            }
        }
        printf("done\n");
    }
    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
    printf("\ntask\tacc_norm\n");
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_batch = params.n_batch;
    const int max_tasks_per_batch = 32;
    const int max_seq = 4*max_tasks_per_batch;
    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
    std::vector<float> tok_logits(n_vocab);
    std::vector<float> batch_logits(n_vocab*n_ctx);
    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
    std::vector<std::thread> workers(std::thread::hardware_concurrency());
    std::vector<int> batch_indeces;
    int n_done = 0;
    int n_correct = 0;
    int n_tot_answers = 0;
    for (size_t i0 = 0; i0 < tasks.size(); i0++) {
        int n_cur = 0;
        size_t i1 = i0;
        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
        llama_batch_clear(batch);
        // batch as many tasks as possible into the available context
        // each task has one unique sequence id per answer
        // the common prefix is shared among all sequences of a task to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        int s0 = 0;
        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
            auto& cur_task = tasks[i1];
            int num_answers = cur_task.seq_tokens.size();
            if (s0 + num_answers > max_seq) {
                break;
            }
            if (int(batch_indeces.size()) != num_answers) {
                batch_indeces.resize(num_answers);
            }
            for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
            for (size_t i = 0; i < cur_task.common_prefix; ++i) {
                //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
            }
            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
                }
            }
            s0 += num_answers;
            cur_task.i_batch = i_batch;
            i_batch += cur_task.required_tokens;
            n_cur += cur_task.required_tokens;
            if (++i1 == tasks.size()) {
                break;
            }
        }
        if (i0 == i1) {
            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }
        llama_kv_cache_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }
        // Compute log-probs in parallel
        // First we collect all tasks
        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto& cur_task = tasks[i];
            size_t li = cur_task.common_prefix;
            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
                    eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
                }
                ++li;
            }
        }
        // Then we do the actual calculation
        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
        size_t ir = 0;
        // compute the logprobs for each ending of the decoded tasks
        for (size_t i = i0; i < i1; ++i) {
            auto & cur_task = tasks[i];
            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
            //    if (cur_task.mc1.labels[j] == 1) {
            //        printf("%d", j+1);
            //    }
            //}
            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
            const auto first_probs = softmax(tok_logits);
            cur_task.log_probs.resize(cur_task.seq_tokens.size());
            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                size_t count = 1;
                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
                    //printf("        %zu  %g\n", ir, eval_results[ir]);
                    ++count;
                    log_prob += eval_results[ir++];
                }
                cur_task.log_probs[s] = log_prob / count;
                //printf("        Final: %g\n", log_prob / count);
                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
            }
            // Find the ending with maximum logprob
            size_t logprob_max_idx = 0;
            float  logprob_max_val = cur_task.log_probs[0];
            for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
                if (cur_task.log_probs[s] > logprob_max_val) {
                    logprob_max_val = cur_task.log_probs[s];
                    logprob_max_idx = s;
                }
            }
            n_tot_answers += cur_task.log_probs.size();
            if (cur_task.mc1.labels[logprob_max_idx] == 1) {
                ++n_correct;
            }
            ++n_done;
            // Print the accumulated accuracy mean x 100
            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
            fflush(stdout);
        }
        i0 = i1 - 1;
    }
    llama_batch_free(batch);
    if (n_done < 100) return;
    float p = 1.f*n_correct/n_done;
    float sigma = sqrt(p*(1-p)/(n_done-1));
    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
    p = 1.f*n_done/n_tot_answers;
    sigma = sqrt(p*(1-p)/(n_done-1));
    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
    printf("\n");
 }
 // Compare the current model against reference data stored in params.logits_file.
 //
 // File layout as read below:
 //   - 8 byte magic "_logits_"
 //   - uint32_t n_ctx, int n_vocab, int n_chunk
 //   - n_ctx * n_chunk evaluation tokens
 //   - per chunk: (n_ctx - 1 - n_ctx/2) rows of nv uint16_t values that are handed to process_logits()
 //
 // For every chunk the tokens are decoded in batches of params.n_batch, the logits of the
 // second half of the chunk are passed to process_logits() together with the reference rows,
 // and a running table (perplexity, log perplexity ratio, KL divergence) is printed to stdout.
 //
 // All failures are reported on stderr and end the function early; nothing is returned.
 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (params.logits_file.empty()) {
        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
        return;
    }
    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
    if (!in) {
        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
        return;
    }
    {
        // verify the magic header
        char check[9]; check[8] = 0;
        in.read(check, 8);
        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
            return;
        }
    }

    uint32_t n_ctx = 0;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    // n_ctx == 0 would make (n_ctx - 1 - n_ctx/2) wrap around below
    if (in.fail() || n_ctx == 0) {
        fprintf(stderr, "%s: failed reading a valid n_ctx from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_ctx > llama_n_ctx(ctx)) {
        // the stored chunks do not fit into the current context - there is no point in continuing
        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %u. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), (unsigned) n_ctx, (unsigned) llama_n_ctx(ctx));
        return;
    }

    int n_vocab = 0;
    int n_chunk = 0;
    in.read((char *)&n_vocab, sizeof(n_vocab));
    in.read((char *)&n_chunk, sizeof(n_chunk));
    if (in.fail()) {
        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
        // the logits rows of the model and of the file would have different lengths,
        // so every size computed from n_vocab below would be wrong for one of them
        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
        return;
    }
    if (n_chunk < 0) {
        fprintf(stderr, "%s: invalid number of chunks (%d) in %s\n", __func__, n_chunk, params.logits_file.c_str());
        return;
    }

    // sizes are computed in size_t to avoid 32-bit wrap-around for large contexts/vocabularies
    std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
    }

    const int n_batch = params.n_batch;
    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
    const int nv = 2*((n_vocab + 1)/2) + 4; // number of uint16_t values per reference row
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
    std::vector<float> logits;
    if (num_batches > 1) {
        logits.reserve(size_t(n_ctx) * n_vocab);
    }

    // hardware_concurrency() may return 0 when the value cannot be determined
    std::vector<std::thread> workers(std::max(1u, std::thread::hardware_concurrency()) - 1);

    // returns (mean, uncertainty of the mean); the uncertainty is only reported for count > 10
    auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
        if (count < 1) {
            return std::make_pair(0., 0.);
        }
        double f = sum/count;
        double df = sum2/count - f*f;
        df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
        return std::make_pair(f, df);
    };

    kl_divergence_result kld;

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

        const auto t_start = std::chrono::high_resolution_clock::now();

        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
            return;
        }

        // clear the KV cache
        llama_kv_cache_clear(ctx);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

            // save original token and restore it after eval
            const auto token_org = tokens[batch_start];

            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }

            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

            // with more than one batch the logits have to be collected batch by batch
            if (num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

        const auto t_end = std::chrono::high_resolution_clock::now();

        if (i == 0) {
            // use the duration of the first chunk to estimate the total run time
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);

            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence\n");
        }

        // only the second half of each chunk is evaluated
        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                workers, log_probs_uint16, kld);

        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);

        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf\n", i+1, exp(ppl.first),
                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second);

        fflush(stdout);

        logits.clear();
    }
    printf("\n");
 }
 int main(int argc, char ** argv) {
    gpt_params params;
@ -1091,6 +1787,10 @@ int main(int argc, char ** argv) {
        hellaswag_score(ctx, params);
    } else if (params.winogrande) {
        winogrande_score(ctx, params);
    } else if (params.multiple_choice) {
        multiple_choice_score(ctx, params);
    } else if (params.kl_divergence) {
        kl_divergence(ctx, params);
    } else {
        results = perplexity(ctx, params);
    }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization"   , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
--- a/flake.lock
+++ b/flake.lock
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1705133751,
+        "lastModified": 1705677747,
-        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
+        "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
+        "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@ -1,3 +1,17 @@
 # The flake interface to llama.cpp's Nix expressions. The flake is used as a
 # more discoverable entry-point, as well as a way to pin the dependencies and
 # expose default outputs, including the outputs built by the CI.
 # For more serious applications involving some kind of customization you may
 # want to consider consuming the overlay, or instantiating `llamaPackages`
 # directly:
 #
 # ```nix
 # pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }
 # ```
 # Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
 # of the relation between Nix and the Nix Flakes.
 {
  description = "Port of Facebook's LLaMA model in C/C++";
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                ggml_tallocr_t src_allocr = node_allocr(src);
                GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                if (src_allocr != node_allocr) {
                    // create a copy of the input in the split's backend
                    size_t id = hash_id(src);
                    if (sched->node_copies[id][cur_backend_id] == NULL) {
                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
                        sched->node_copies[id][cur_backend_id] = tensor_copy;
                        node_allocr(tensor_copy) = cur_allocr;
                        SET_CAUSE(tensor_copy, "4.cpy");
                        int n_inputs = sched->splits[cur_split].n_inputs++;
                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                        sched->splits[cur_split].inputs[n_inputs] = src;
                    }
                    node->src[j] = sched->node_copies[id][cur_backend_id];
 #if 0
                    // check if the input is already in the split
                    bool found = false;
                    for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                        sched->splits[cur_split].inputs[n_inputs] = src;
                    }
-
+#endif
                    // create a copy of the input in the split's backend
                    size_t id = hash_id(src);
                    if (sched->node_copies[id][cur_backend_id] == NULL) {
                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
                        sched->node_copies[id][cur_backend_id] = tensor_copy;
                        node_allocr(tensor_copy) = cur_allocr;
                        SET_CAUSE(tensor_copy, "4.cpy");
                    }
                    node->src[j] = sched->node_copies[id][cur_backend_id];
                }
            }
        }
@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
        uint64_t compute_start_us = ggml_time_us();
        if (!sched->callback_eval) {
            ggml_backend_graph_compute(split_backend, &split->graph);
-          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -12,9 +12,6 @@
 #include <vector>
 #include <map>
 #include <array>
 #include "ggml-cuda.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@ -118,6 +115,11 @@
 #endif // defined(GGML_USE_HIPBLAS)
 // ggml-cuda need half type so keep ggml headers include at last
 #include "ggml-cuda.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 #define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CC_PASCAL     600
--- a/ggml.c
+++ b/ggml.c
@ -1418,6 +1418,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@ -1776,9 +1779,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "GELU",
    "GELU_QUICK",
    "SILU",
    "HARDSWISH",
    "HARDSIGMOID",
 };
-static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@ -3945,6 +3950,20 @@ struct ggml_tensor * ggml_silu_back(
    return result;
 }
 // ggml_hardswish
 // creates a unary node that applies GGML_UNARY_OP_HARDSWISH to `a`
 struct ggml_tensor * ggml_hardswish(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
    return result;
 }
 // ggml_hardsigmoid
 // creates a unary node that applies GGML_UNARY_OP_HARDSIGMOID to `a`
 struct ggml_tensor * ggml_hardsigmoid(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
    return result;
 }
 // ggml_norm
 static struct ggml_tensor * ggml_norm_impl(
@ -5344,6 +5363,33 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
    return result;
 }
 // ggml_conv_depthwise
 //
 // Builds the graph for a depthwise 2D convolution out of reshape, im2col and mul_mat nodes.
 //
 //   a, b           - the two operands; both get their last two dimensions merged before im2col
 //                    (presumably `a` is the kernel and `b` the input data - TODO confirm against callers)
 //   c              - not referenced by this function
 //   s0, s1, p0, p1,
 //   d0, d1         - forwarded unchanged to ggml_im2col
 //                    (presumably stride / padding / dilation per dimension - TODO confirm)
 //
 // Returns the mul_mat result reshaped to [im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]].
 struct ggml_tensor * ggml_conv_depthwise_2d(
    struct ggml_context * ctx,
    struct ggml_tensor * a,
    struct ggml_tensor * b,
    struct ggml_tensor * c,
    int                  s0,
    int                  s1,
    int                  p0,
    int                  p1,
    int                  d0,
    int                  d1) {
    // fold dimensions 2 and 3 of `a` into dimension 3, leaving dimension 2 with size 1
    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
    // `b` is reshaped the same way before being passed to im2col
    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
    // multiply the flattened `a` with the im2col output, regrouped by b->ne[2] and b->ne[3]
    struct ggml_tensor * result =
        ggml_mul_mat(ctx,
                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1),                       // [OC, 1, KH, KW] => [1, OC, 1, KH * KW]
                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
    // split the merged spatial dimension back into im2col->ne[1] x im2col->ne[2]
    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
    return result;
 }
 // ggml_conv_2d
 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@ -7764,6 +7810,9 @@ static void ggml_compute_forward_acc_f32(
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
    if (!inplace && (params->type == GGML_TASK_INIT)) {
        if (params->ith != 0) {
            return;
        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        memcpy(
@ -9333,6 +9382,87 @@ static void ggml_compute_forward_silu_back(
    }
 }
 // Applies ggml_vec_hardswish_f32 to every row of src0 and stores the rows in dst.
 // Expects to be called by thread 0 only, with src0 and dst of identical shape.
 static void ggml_compute_forward_hardswish_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
    assert(params->ith == 0);
    assert(ggml_are_same_shape(src0, dst));

    // nothing to do in the INIT and FINALIZE phases
    switch (params->type) {
        case GGML_TASK_INIT:
        case GGML_TASK_FINALIZE:
            return;
        default:
            break;
    }

    const int row_count = ggml_nrows(src0);
    const int row_len   = src0->ne[0];

    // elements within a row must be densely packed floats
    assert(dst->nb[0]  == sizeof(float));
    assert(src0->nb[0] == sizeof(float));

    for (int row = 0; row < row_count; ++row) {
        float       * out = (float *) ((char *) dst->data  + row*( dst->nb[1]));
        const float * in  = (float *) ((char *) src0->data + row*(src0->nb[1]));
        ggml_vec_hardswish_f32(row_len, out, in);
    }
 }
 // Type dispatcher for the hardswish unary op: only F32 sources are implemented,
 // every other source type trips GGML_ASSERT.
 static void ggml_compute_forward_hardswish(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
    if (src0->type == GGML_TYPE_F32) {
        ggml_compute_forward_hardswish_f32(params, src0, dst);
        return;
    }

    // unsupported source type
    GGML_ASSERT(false);
 }
 // Applies ggml_vec_hardsigmoid_f32 to every row of src0 and stores the rows in dst.
 // Expects to be called by thread 0 only, with src0 and dst of identical shape.
 static void ggml_compute_forward_hardsigmoid_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
    assert(params->ith == 0);
    assert(ggml_are_same_shape(src0, dst));

    // nothing to do in the INIT and FINALIZE phases
    switch (params->type) {
        case GGML_TASK_INIT:
        case GGML_TASK_FINALIZE:
            return;
        default:
            break;
    }

    const int row_count = ggml_nrows(src0);
    const int row_len   = src0->ne[0];

    // elements within a row must be densely packed floats
    assert(dst->nb[0]  == sizeof(float));
    assert(src0->nb[0] == sizeof(float));

    for (int row = 0; row < row_count; ++row) {
        float       * out = (float *) ((char *) dst->data  + row*( dst->nb[1]));
        const float * in  = (float *) ((char *) src0->data + row*(src0->nb[1]));
        ggml_vec_hardsigmoid_f32(row_len, out, in);
    }
 }
 // Type dispatcher for the hardsigmoid unary op: only F32 sources are implemented,
 // every other source type trips GGML_ASSERT.
 static void ggml_compute_forward_hardsigmoid(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
    if (src0->type == GGML_TYPE_F32) {
        ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
        return;
    }

    // unsupported source type
    GGML_ASSERT(false);
 }
 // ggml_compute_forward_norm
 static void ggml_compute_forward_norm_f32(
@ -9825,11 +9955,30 @@ static void ggml_compute_forward_mul_mat(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
+        const int64_t ne_plane      = ne01*ne00;
-            return;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        }
+        UNUSED(desired_wsize);
        if (params->type == GGML_TASK_INIT) {
            if (type != GGML_TYPE_F32) {
                assert(params->wsize >= desired_wsize);
                // parallelize by src0 rows
                for (int64_t i13 = 0; i13 < ne13; i13++) {
                    for (int64_t i12 = 0; i12 < ne12; i12++) {
                        // broadcast src0 into src1 across 2nd,3rd dimension
                        const int64_t i03 = i13/r3;
                        const int64_t i02 = i12/r2;
                        const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
                              float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                              ggml_to_float_t  const to_float = type_traits[type].to_float;
                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
                        }
                    }
                }
            }
            return;
        }
@ -9837,9 +9986,14 @@ static void ggml_compute_forward_mul_mat(
            return;
        }
        // perform sgemm, parallelization controlled by blas lib
        if (ith != 0) {
            return;
        }
        const int64_t tgemm0 = ggml_perf_time_us();
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                // broadcast src0 into src1 across 2nd,3rd dimension
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;
@ -9848,17 +10002,7 @@ static void ggml_compute_forward_mul_mat(
                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
                if (type != GGML_TYPE_F32) {
-                            float * const wdata    = params->wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                    ggml_to_float_t const to_float = type_traits[type].to_float;
                    size_t id = 0;
                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
                        id += ne00;
                    }
                    assert(id*sizeof(float) <= params->wsize);
                    x = wdata;
                }
                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -9868,6 +10012,7 @@ static void ggml_compute_forward_mul_mat(
                         0.0f,    d, ne01);
            }
        }
        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@ -9876,6 +10021,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
    if (params->type == GGML_TASK_INIT) {
        if (ith != 0) {
            return;
        }
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10040,6 +10188,9 @@ static void ggml_compute_forward_mul_mat_id(
    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
   if (params->type == GGML_TASK_INIT) {
        if (ith != 0) {
            return;
        }
        char * wdata = params->wdata;
        if (src1->type != vec_dot_type) {
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10225,6 +10376,9 @@ static void ggml_compute_forward_out_prod_f32(
            return;
        }
 #endif
        if (ith != 0) {
            return;
        }
        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
        return;
    }
@ -10408,6 +10562,9 @@ static void ggml_compute_forward_out_prod_q_f32(
    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
    if (params->type == GGML_TASK_INIT) {
        if (ith != 0) {
            return;
        }
        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
        return;
    }
@ -10592,6 +10749,9 @@ static void ggml_compute_forward_set_f32(
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
    if (!inplace && (params->type == GGML_TASK_INIT)) {
        if (params->ith != 0) {
            return;
        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        memcpy(
@ -10916,6 +11076,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
    if (params->type == GGML_TASK_INIT) {
        if (params->ith != 0) {
            return;
        }
        memset(dst->data, 0, ggml_nbytes(dst));
    }
@ -10950,6 +11113,9 @@ static void ggml_compute_forward_get_rows_back_f32(
    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
    if (params->type == GGML_TASK_INIT) {
        if (params->ith != 0) {
            return;
        }
        memset(dst->data, 0, ggml_nbytes(dst));
    }
@ -11087,6 +11253,9 @@ static void ggml_compute_forward_diag_mask_f32(
    GGML_ASSERT(n_past >= 0);
    if (!inplace && (params->type == GGML_TASK_INIT)) {
        if (ith != 0) {
            return;
        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@ -12057,6 +12226,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
    GGML_ASSERT(nb10 == sizeof(float));
    if (params->type == GGML_TASK_INIT) {
        if (ith != 0) {
            return;
        }
        memset(params->wdata, 0, params->wsize);
        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12151,6 +12323,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
    GGML_ASSERT(nb10 == sizeof(float));
    if (params->type == GGML_TASK_INIT) {
        if (ith != 0) {
            return;
        }
        memset(params->wdata, 0, params->wsize);
        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12349,6 +12524,7 @@ static void ggml_compute_forward_im2col(
    }
 }
 // ggml_compute_forward_conv_transpose_2d
 static void ggml_compute_forward_conv_transpose_2d(
@ -12374,6 +12550,9 @@ static void ggml_compute_forward_conv_transpose_2d(
    GGML_ASSERT(nb10 == sizeof(float));
    if (params->type == GGML_TASK_INIT) {
        if (ith != 0) {
            return;
        }
        memset(params->wdata, 0, params->wsize);
        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@ -13917,6 +14096,14 @@ static void ggml_compute_forward_unary(
            {
                ggml_compute_forward_silu(params, src0, dst);
            } break;
        case GGML_UNARY_OP_HARDSWISH:
            {
                ggml_compute_forward_hardswish(params, src0, dst);
            } break;
        case GGML_UNARY_OP_HARDSIGMOID:
            {
                ggml_compute_forward_hardsigmoid(params, src0, dst);
            } break;
        default:
            {
                GGML_ASSERT(false);
@ -13980,6 +14167,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
    if (!inplace && params->type == GGML_TASK_INIT) {
        if (params->ith != 0) {
            return;
        }
        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
        return;
    }
@ -16273,8 +16463,9 @@ struct ggml_compute_state_shared {
    const int n_threads;
    // synchronization primitives
-    atomic_int n_active; // num active threads
+    atomic_int n_active;  // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int node_n;    // active graph node
    atomic_int node_task; // active graph node task phase
    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
@ -16330,6 +16521,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
                case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                    {
                        n_tasks = 1;
                    } break;
@ -16520,6 +16713,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
    return n_tasks;
 }
 // Poll the shared graph-node index until it differs from the value the caller
 // currently holds.
 //
 //   node_n   - in:  the node index this thread last observed
 //              out: the most recently loaded value of state->shared->node_n
 //   state    - per-thread compute state; only state->shared->node_n is read here
 //   do_yield - when true, sched_yield() is called before every poll
 static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
    // remember what we started with so a change can be detected
    const int prev_node_n = *node_n;

    do {
        if (do_yield) {
            sched_yield();
        }
        *node_n = atomic_load(&state->shared->node_n);
    } while (*node_n == prev_node_n);
 }
 // Poll the shared task-phase value until it differs from the phase the caller
 // currently holds.
 //
 //   task_phase - in:  the phase this thread last observed
 //                out: the most recently loaded value of state->shared->node_task
 //   state      - per-thread compute state; only state->shared->node_task is read here
 //   do_yield   - when true, sched_yield() is called before every poll
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
    // remember what we started with so a change can be detected
    const int prev_task_phase = *task_phase;

    do {
        if (do_yield) {
            sched_yield();
        }
        *task_phase = atomic_load(&state->shared->node_task);
    } while (*task_phase == prev_task_phase);
 }
 static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@ -16530,7 +16751,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    set_numa_thread_affinity(state->ith, n_threads);
-    int node_n = -1;
+    int node_n     = -1;
    int task_phase = GGML_TASK_FINALIZE;
    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@ -16562,7 +16784,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            // distribute new work or execute it direct if 1T
            while (++node_n < cgraph->n_nodes) {
                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
                struct ggml_tensor * node = cgraph->nodes[node_n];
                const int n_tasks = ggml_get_n_tasks(node, n_threads);
@ -16571,13 +16792,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                params.nth = n_tasks;
                /* INIT */
                if (GGML_OP_HAS_INIT[node->op]) {
                    params.type = GGML_TASK_INIT;
                    ggml_compute_forward(&params, node);
                }
                if (n_tasks == 1) {
                    /* INIT */
                    if (GGML_OP_HAS_INIT[node->op]) {
                        params.type = GGML_TASK_INIT;
                        ggml_compute_forward(&params, node);
                    }
                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                    // they do something more efficient than spinning (?)
                    params.type = GGML_TASK_COMPUTE;
@ -16598,38 +16819,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                }
            }
-            atomic_store(&state->shared->n_active, n_threads);
+            task_phase = GGML_TASK_INIT;
-            atomic_store(&state->shared->node_n,   node_n);
+            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_n,    node_n);
            atomic_store(&state->shared->node_task, task_phase);
        } else {
-            // wait for other threads to finish
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
-            const int last = node_n;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
            while (true) {
                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                //       depending on the workload and the operating system.
                //       since it is not clear what is the best approach, it should potentially become user-configurable
                //       ref: https://github.com/ggerganov/ggml/issues/291
                // UPD:  adding the do_yield flag seems to resolve the issue universally
                if (do_yield) {
                    sched_yield();
                }
                node_n = atomic_load(&state->shared->node_n);
                if (node_n != last) break;
            };
        }
        // check if we should stop
        if (node_n >= cgraph->n_nodes) break;
-        /* COMPUTE */
+        /* INIT & COMPUTE */
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads);
        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
            /*.ith   =*/ state->ith,
            /*.nth   =*/ n_tasks,
            /*.wsize =*/ cplan->work_size,
@ -16637,8 +16844,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        };
        if (state->ith < n_tasks) {
            if (GGML_OP_HAS_INIT[node->op]) {
                ggml_compute_forward(&params, node);
            }
        }
        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            task_phase = GGML_TASK_COMPUTE;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_task, task_phase);
        }
        else {
            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
            //       depending on the workload and the operating system.
            //       since it is not clear what is the best approach, it should potentially become user-configurable
            //       ref: https://github.com/ggerganov/ggml/issues/291
            // UPD:  adding the do_yield flag seems to resolve the issue universally
            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
        }
        if (state->ith < n_tasks) {
            params.type = GGML_TASK_COMPUTE;
            ggml_compute_forward(&params, node);
        }
        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            task_phase = GGML_TASK_FINALIZE;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_task, task_phase);
        }
        else {
            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
        }
    }
    return GGML_EXIT_SUCCESS;
@ -16695,8 +16933,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
+                            // here we need memory for fully dequantized matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                        }
                    } else
 #endif
@ -16850,6 +17088,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
        /*.n_threads               =*/ n_threads,
        /*.n_active                =*/ n_threads,
        /*.node_n                  =*/ -1,
        /*.node_task               =*/ GGML_TASK_FINALIZE,
        /*.abort_callback          =*/ NULL,
        /*.abort_callback_data     =*/ NULL,
    };
--- a/ggml.h
+++ b/ggml.h
@ -489,6 +489,8 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_COUNT,
    };
@ -1032,6 +1034,16 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // hardswish(x) = x * relu6(x + 3) / 6
    GGML_API struct ggml_tensor * ggml_hardswish(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // hardsigmoid(x) = relu6(x + 3) / 6
    GGML_API struct ggml_tensor * ggml_hardsigmoid(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@ -1483,6 +1495,18 @@ extern "C" {
            int                  d1,
            bool                 is_2D);
    // depthwise 2D convolution
    // NOTE(review): the roles of tensors a, b and c are not visible from this header - confirm against the definition
    // presumably s0/s1 = stride, p0/p1 = padding, d0/d1 = dilation along dims 0 and 1 - TODO confirm
    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            struct ggml_tensor  * c,
            int                  s0,
            int                  s1,
            int                  p0,
            int                  p1,
            int                  d0,
            int                  d1);
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
--- a/llama.cpp
+++ b/llama.cpp
@ -1325,8 +1325,10 @@ static llama_state g_state;
 // available llama models
 enum e_model {
    MODEL_UNKNOWN,
    MODEL_0_5B,
    MODEL_1B,
    MODEL_3B,
    MODEL_4B,
    MODEL_7B,
    MODEL_8B,
    MODEL_13B,
@ -2659,6 +2661,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
        default: return "unknown, may not work";
    }
@ -2874,6 +2877,7 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
               }
@ -2892,9 +2896,9 @@ static void llm_load_hparams(
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
-                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
-                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 40: model.type = hparams.n_head == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
@ -3697,6 +3701,11 @@ static bool llm_load_tensors(
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        // optional bias tensors, present in Stable LM 2 1.6B
                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     false);
                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, false);
                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, false);
                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
@ -4315,6 +4324,7 @@ static struct ggml_tensor * llm_build_kqv(
          const llama_model & model,
        const llama_hparams & hparams,
       const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         struct ggml_tensor * wo,
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
@ -4393,6 +4403,8 @@ static struct ggml_tensor * llm_build_kqv(
    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
    cb(cur, "kqv_merged_cont", il);
    ggml_build_forward_expand(graph, cur);
    cur = ggml_mul_mat(ctx, wo, cur);
    if (wo_b) {
        cb(cur, "kqv_wo", il);
@ -4405,6 +4417,44 @@ static struct ggml_tensor * llm_build_kqv(
    return cur;
 }
 // Combined helper: expands k_cur, v_cur and q_cur into `graph` (in that order),
 // then calls llm_build_kv_store() followed by llm_build_kqv().
 //
 // The tensor returned by llm_build_kqv() is reported through `cb` under the
 // name "kqv_out" for layer `il` and is returned to the caller.
 static struct ggml_tensor * llm_build_kv(
        struct ggml_context * ctx,
          const llama_model & model,
        const llama_hparams & hparams,
       const llama_kv_cache & kv,
         struct ggml_cgraph * graph,
         struct ggml_tensor * wo,
         struct ggml_tensor * wo_b,
         struct ggml_tensor * k_cur,
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
                    int64_t   n_ctx,
                    int32_t   n_tokens,
                    int32_t   kv_head,
                    int32_t   n_kv,
                    float     max_alibi_bias,
                    float     kq_scale,
         const llm_build_cb & cb,
                    int       il) {
    // these nodes are added to the graph together so that they are not reordered
    // by doing so, the number of splits in the graph is reduced
    struct ggml_tensor * const kvq_nodes[] = { k_cur, v_cur, q_cur };
    for (struct ggml_tensor * node : kvq_nodes) {
        ggml_build_forward_expand(graph, node);
    }

    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

    struct ggml_tensor * attn_out = llm_build_kqv(
            ctx, model, hparams, kv, graph,
            wo, wo_b,
            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
    cb(attn_out, "kqv_out", il);

    return attn_out;
 }
 struct llm_build_context {
    const llama_model    & model;
    const llama_hparams  & hparams;
@ -4562,12 +4612,6 @@ struct llm_build_context {
                    cb(Vcur, "Vcur", il);
                }
                // these nodes are added to the graph together so that they are not reordered
                // by doing so, the number of splits in the graph is reduced
                ggml_build_forward_expand(gf, Qcur);
                ggml_build_forward_expand(gf, Kcur);
                ggml_build_forward_expand(gf, Vcur);
                Qcur = ggml_rope_custom(
                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@ -4582,11 +4626,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4763,14 +4805,13 @@ struct llm_build_context {
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
                // apply ALiBi for 13B model
                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4892,11 +4933,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4993,11 +5032,9 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5200,12 +5237,9 @@ struct llm_build_context {
                        );
                cb(Vcur, "Vcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                // TODO: not tested, could be broken
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5292,11 +5326,9 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                cb(Qcur, "Qcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5390,11 +5422,9 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5485,11 +5515,9 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5576,12 +5604,24 @@ struct llm_build_context {
                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                    cb(Qcur, "Qcur", il);
                }
                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                    cb(Kcur, "Kcur", il);
                }
                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                    cb(Vcur, "Vcur", il);
                }
                Qcur = ggml_rope_custom(
                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
@ -5597,11 +5637,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5714,11 +5752,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5837,11 +5873,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5966,11 +6000,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6071,11 +6103,9 @@ struct llm_build_context {
                        ext_factor, attn_factor, beta_fast, beta_slow);
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
            struct ggml_tensor * sa_out = cur;
@ -6172,11 +6202,9 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6283,11 +6311,9 @@ struct llm_build_context {
                );
                cb(Kcur, "Kcur", il);
-                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6355,6 +6381,14 @@ static struct ggml_cgraph * llama_build_graph(
            ggml_set_name(cur, name);
        }
        if (!lctx.cparams.offload_kqv) {
            if (strcmp(name, "kqv_merged_cont") == 0) {
                // all nodes between the KV store and the attention output are run on the CPU
                ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
            }
        }
        //
        // allocate input tensors and set input data
        //
@ -8750,9 +8784,13 @@ struct quantize_state_internal {
    const llama_model_quantize_params * params;
    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
+    int n_ffn_down        = 0;
    int n_ffn_gate        = 0;
    int n_ffn_up          = 0;
    int i_attention_wv    = 0;
-    int i_feed_forward_w2 = 0;
+    int i_ffn_down        = 0;
    int i_ffn_gate        = 0;
    int i_ffn_up          = 0;
    int n_k_quantized     = 0;
    int n_fallback        = 0;
@ -8855,8 +8893,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            ++qs.i_attention_wv;
        }
        else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
+            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
-            ++qs.i_feed_forward_w2;
+            ++qs.i_ffn_down;
        }
        else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
    } else if (name.find("attn_v.weight") != std::string::npos) {
@ -8893,18 +8931,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
            new_type = GGML_TYPE_Q2_K;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
        int i_layer, n_layer;
        if (n_expert == 1) {
-            i_layer = qs.i_feed_forward_w2;
+            i_layer = qs.i_ffn_down;
-            n_layer = qs.n_feed_forward_w2;
+            n_layer = qs.n_ffn_down;
        } else {
            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
            // for getting the current layer as I initially thought, and we need to resort to parsing the
            // tensor name.
-            n_layer = qs.n_feed_forward_w2 / n_expert;
+            n_layer = qs.n_ffn_down / n_expert;
            if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
                throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
            }
@ -8913,7 +8954,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            }
        }
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@ -8943,11 +8984,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
-        ++qs.i_feed_forward_w2;
+        ++qs.i_ffn_down;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                    new_type = GGML_TYPE_Q5_K;
                }
@ -8965,6 +9007,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
            new_type = GGML_TYPE_Q2_K;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
            new_type = GGML_TYPE_Q2_K;
        }
        ++qs.i_ffn_up;
    }
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@ -9019,8 +9075,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@ -9088,12 +9145,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            ++qs.n_attention_wv;
        }
        else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_feed_forward_w2;
+            ++qs.n_ffn_down;
        }
        else if (name.find("ffn_gate") != std::string::npos) {
            ++qs.n_ffn_gate;
        }
        else if (name.find("ffn_up") != std::string::npos) {
            ++qs.n_ffn_up;
        }
    }
-    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
    }
    size_t total_size_org = 0;
--- a/llama.h
+++ b/llama.h
@ -107,6 +107,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
--- a/mypy.ini
+++ b/mypy.ini
@ -4,3 +4,4 @@ allow_untyped_calls = true
 allow_untyped_defs = true
 allow_incomplete_defs = true
 disable_error_code = import-untyped
 warn_return_any = false
--- a/unicode.h
+++ b/unicode.h
@ -2,8 +2,9 @@
 #include <cassert>
 #include <stdexcept>
-#include <vector>
+#include <string>
 #include <unordered_map>
 #include <vector>
 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},