Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf

2023-08-21 18:48:23 +02:00 · 2023-08-21 18:48:23 +02:00 · a856685648
commit a856685648
parent 44dd9ed287 0b53b8b08d
17 changed files with 765 additions and 309 deletions
--- a/README.md
+++ b/README.md
@ -9,13 +9,13 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Hot topics:**
+### 🚧 Incoming breaking change + refactoring:

- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+
+To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+
+----

 <details>
  <summary>Table of Contents</summary>
@ -99,6 +99,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)

 **UI:**

--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@ -118,6 +118,11 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+

 # TOKENIZATION

@ -147,9 +152,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
        if tokenizer.is_control(i):
            toktype = 3

-        # TODO: How to determinate if a token is user defined?
-        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-        # if tokenizer.is_user_defined(i): toktype = 4
+        # toktype = 4 is user-defined = tokens from added_tokens.json

        if tokenizer.is_unused(i):
            toktype = 5
@ -160,6 +163,17 @@ if Path(dir_model + "/tokenizer.model").is_file():
        scores.append(score)
        toktypes.append(toktype)

+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
    gguf_writer.add_tokenizer_model("llama")
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@ -0,0 +1,334 @@
+import sys, struct, math, argparse
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+# QK_K is used below to compute the per-block byte size of the *_K types.
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Maps each GGML quantization type to a (block size, type size) pair:
+# block size is the number of elements stored per block and type size is
+# the number of bytes one block occupies. Tensor.load combines the two as
+# (n_elems * type size) // block size to get a tensor's length in bytes.
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class Hyperparameters:
+    """Model hyperparameters read from the header of a GGJTv3 file."""
+
+    def __init__(self):
+        # Everything starts at zero until load() / set_n_ff() fill it in.
+        self.n_vocab = 0
+        self.n_embd = 0
+        self.n_mult = 0
+        self.n_head = 0
+        self.n_layer = 0
+        self.n_rot = 0
+        self.ftype = 0
+        self.n_ff = 0
+
+    def set_n_ff(self, model):
+        """Take n_ff from the second dimension of layer 0's w1 feed-forward tensor."""
+        idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert idx is not None, 'Missing layer 0 FF tensor'
+        self.n_ff = model.tensors[idx].dims[1]
+
+    def load(self, data, offset):
+        """Unpack seven little-endian uint32 fields at offset; return the bytes consumed."""
+        n_bytes = 4 * 7
+        fields = struct.unpack('<7I', data[offset:offset + n_bytes])
+        self.n_vocab, self.n_embd, self.n_mult, self.n_head = fields[:4]
+        self.n_layer, self.n_rot, self.ftype = fields[4:]
+        return n_bytes
+
+    def __str__(self):
+        return (
+            f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, '
+            f'n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, '
+            f'n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+        )
+
+class Vocab:
+    """Vocabulary section of a GGJTv3 file, kept as a list of (bytes, score) pairs."""
+
+    def __init__(self):
+        self.items = []
+
+    def load(self, data, offset, n_vocab):
+        """Read n_vocab entries starting at offset and return the number of bytes consumed.
+
+        Each entry is a little-endian uint32 length, that many bytes of token
+        text, then a little-endian float32 score.
+        """
+        pos = offset
+        for _ in range(n_vocab):
+            (text_len,) = struct.unpack('<I', data[pos:pos + 4])
+            assert text_len < 4096, 'Absurd vocab item length'
+            pos += 4
+            text = bytes(data[pos:pos + text_len])
+            pos += text_len
+            (score,) = struct.unpack('<f', data[pos:pos + 4])
+            pos += 4
+            self.items.append((text, score))
+        return pos - offset
+
+class Tensor:
+    """Metadata for one tensor record of a GGJTv3 file.
+
+    Only the location of the tensor data is recorded (start_offset and
+    len_bytes, relative to the buffer passed to load); the data is not copied.
+    """
+
+    def __init__(self):
+        self.name = None
+        self.dims = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = 0
+
+    def load(self, data, offset):
+        """Parse the tensor record at offset and return the number of bytes it spans.
+
+        The record is three little-endian uint32 values (n_dims, name length,
+        dtype), then n_dims uint32 dimensions, the name bytes, padding up to a
+        32-byte boundary and finally the tensor data.
+        """
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert 0 <= n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype = dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        # Tensor data starts at the next 32-byte aligned offset.
+        pad = ((offset + 31) & ~31) - offset
+        offset += pad
+        # Multiply with plain Python ints: np.prod uses the platform default
+        # integer (32-bit on some platforms, where large tensors overflow) and
+        # yields a float for an empty dims tuple.
+        n_elems = 1
+        for dim in self.dims:
+            n_elems *= dim
+        n_bytes = (n_elems * tysize) // blksize
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLV3Model:
+    """Parsed view of a GGJTv3 file: hyperparameters, vocab and a tensor table."""
+
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        """Check for the b'tjgg' magic followed by version 3; return the header size (8)."""
+        if bytes(data[offset:offset + 4]) != b'tjgg':
+            raise ValueError('Only GGJTv3 supported')
+        (version,) = struct.unpack('<I', data[offset + 4:offset + 8])
+        if version != 3:
+            raise ValueError('Only GGJTv3 supported')
+        return 8
+
+    def load(self, data, offset):
+        """Parse the whole file starting at offset and return the final offset.
+
+        Reads the header, hyperparameters and vocab, then tensor records until
+        the end of data. The parsed pieces are only stored on the instance
+        once everything has been read.
+        """
+        pos = offset + self.validate_header(data, offset)
+        hparams = Hyperparameters()
+        pos += hparams.load(data, pos)
+        vocab = Vocab()
+        pos += vocab.load(data, pos, hparams.n_vocab)
+        tensor_list = []
+        index_by_name = {}
+        while pos < len(data):
+            entry = Tensor()
+            pos += entry.load(data, pos)
+            # Map each tensor name to its position in the tensor list.
+            index_by_name[entry.name] = len(tensor_list)
+            tensor_list.append(entry)
+        self.hyperparameters = hparams
+        self.vocab = vocab
+        self.tensors = tensor_list
+        self.tensor_map = index_by_name
+        # n_ff is not part of the header; it is taken from a tensor shape.
+        hparams.set_n_ff(self)
+        return pos
+
+class GGMLToGGUF:
+    """Writes a parsed GGJTv3 model out as a GGUF file.
+
+    ggml_model is the parsed model, data the buffer it was parsed from and
+    cfg the parsed command line options. params_override / vocab_override,
+    when given, replace the hyperparameters and vocab taken from the file.
+    """
+
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                # Search for the KV head count x such that n_head / x == gqa.
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        """Create the GGUF writer for cfg.output, add all items, then write and close the file."""
+        print('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        self.add_tensors(gguf_writer)
+        print("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        print("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        print("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        """Add name, description and model hyperparameters to gguf_writer.
+
+        With a params override the values come from the override (after
+        checking it agrees with the file on n_embd, n_layer and n_head);
+        otherwise they come from the file's hyperparameters and cfg.
+        """
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        print('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        """Add the tokenizer model name, token list, scores and token types to gguf_writer.
+
+        With a vocab override its tokens are used as-is. Otherwise the vocab
+        from the file is converted: empty items become control tokens,
+        single-byte items with ids 3..258 become '<0xXX>' byte tokens, and
+        spaces in all other items are replaced by U+2581.
+        """
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            print('* Adding vocab item(s)')
+            for vitem in vo.all_tokens():
+                tokens.append(vitem[0])
+                scores.append(vitem[1])
+                # Maybe try to guess the token type when it is missing?
+                if len(vitem) == 3:
+                    toktypes.append(vitem[2])
+            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            if len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                # Always emit two hex digits: hex() would turn byte 0x0A into
+                # '<0xA>' rather than '<0x0A>'.
+                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+
+    def add_tensors(self, gguf_writer):
+        """Add every tensor of the model to gguf_writer under its mapped GGUF name."""
+        nm = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            # Reset the suffix for every tensor so a name without a known
+            # suffix neither fails on an unbound variable nor inherits the
+            # suffix of the previous tensor.
+            suffix = ''
+            if name.endswith('.weight'):
+                name = name[:-7]
+                suffix = '.weight'
+            elif name.endswith('.bias'):
+                name = name[:-5]
+                suffix = '.bias'
+            mapped_name = nm.get(name)
+            assert mapped_name is not None, f'Bad name {name}'
+            mapped_name += suffix
+            tempdims = list(tensor.dims)
+            if len(tempdims) > 1:
+                # The first two dimensions are swapped for the writer.
+                # NOTE(review): presumably GGUF expects the opposite order - confirm.
+                tempdims[0], tempdims[1] = tempdims[1], tempdims[0]
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+    """Load override params and vocab from cfg.model_metadata_dir via the convert module.
+
+    Uses config.json when it exists in the directory, otherwise params.json,
+    and raises ValueError when neither exists. The vocab is loaded from
+    cfg.vocab_dir when set, else from the metadata directory.
+    Returns a (params, vocab) tuple.
+    """
+    import convert
+    metadata_dir = cfg.model_metadata_dir
+    assert metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path   = metadata_dir / "config.json"
+    orig_config_path = metadata_dir / "params.json"
+
+    def fake_tensor(shape):
+        # Bare LazyTensor carrying nothing but a shape.
+        tensor = convert.LazyTensor.__new__(convert.LazyTensor)
+        tensor.shape = shape
+        return tensor
+
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': fake_tensor([hp.n_vocab]),
+        'layers.0.feed_forward.w1.weight': fake_tensor([hp.n_ff]),
+    }
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab_source = cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir
+    vocab = convert.load_vocab(vocab_source, cfg.vocabtype)
+    convert.check_vocab_size(params, vocab)
+    return (params, vocab)
+
+def handle_args():
+    """Build the command line parser for the converter and parse sys.argv."""
+    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+    # (flags, keyword arguments) for every option, in the order they are registered.
+    arg_specs = (
+        (('--input', '-i'), dict(type = Path, help = 'Input GGMLv3 filename')),
+        (('--output', '-o'), dict(type = Path, help = 'Output GGUF filename')),
+        (('--name',), dict(help = 'Set model name')),
+        (('--desc',), dict(help = 'Set model description')),
+        (('--gqa',), dict(type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')),
+        (('--eps',), dict(default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')),
+        (('--context-length', '-c'), dict(type = int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')),
+        (('--model-metadata-dir', '-m'), dict(type = Path, help = 'Load HuggingFace/.pth vocab and metadata from the specified directory')),
+        (('--vocab-dir',), dict(type = Path, help = 'directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir')),
+        (('--vocabtype',), dict(choices = ['spm', 'bpe'], help = 'vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)', default = 'spm')),
+    )
+    for (flags, options) in arg_specs:
+        parser.add_argument(*flags, **options)
+    return parser.parse_args()
+
+def main():
+    """Convert the GGJTv3 file named by --input into a GGUF file at --output.
+
+    Memory-maps the input, parses it, optionally loads override params and
+    vocab from --model-metadata-dir, and writes the result.
+    """
+    cfg = handle_args()
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLV3Model()
+    print('* Scanning GGML input file')
+    # The end offset returned by load() is not needed here.
+    model.load(data, 0)
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+    else:
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter.save()
+    print(f'* Successful completion. Output saved to: {cfg.output}')
+
+# Only run the conversion when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@ -126,6 +126,11 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+

 # TOKENIZATION

@ -155,9 +160,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
        if tokenizer.is_control(i):
            toktype = 3

-        # TODO: How to determinate if a token is user defined?
-        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-        # if tokenizer.is_user_defined(i): toktype = 4
+        # toktype = 4 is user-defined = tokens from added_tokens.json

        if tokenizer.is_unused(i):
            toktype = 5
@ -168,6 +171,18 @@ if Path(dir_model + "/tokenizer.model").is_file():
        scores.append(score)
        toktypes.append(toktype)

+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+
    gguf_writer.add_tokenizer_model("llama")
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
@ -264,7 +279,9 @@ for part_name in part_names:
        data = data.squeeze().numpy()

        # reverse permute these
-        if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+        if name.endswith(".q_proj.weight"):
+            data = reverse_hf_permute(data, head_count)
+        if name.endswith(".k_proj.weight"):
            data = reverse_hf_permute(data, head_count, head_count_kv)

        # map tensor names
--- a/convert.py
+++ b/convert.py
@ -241,11 +241,13 @@ class BpeVocab:
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            added_tokens = {}
+
        vocab_size: int = len(self.bpe_tokenizer)
        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids      = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
@ -261,12 +263,12 @@ class BpeVocab:
        for i, item in enumerate(tokenizer):
            text: bytes = item.encode("utf-8")
            score: float = -i
-            yield text, score, 4
+            yield text, score, gguf.TokenType.USER_DEFINED

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        for text in self.added_tokens_list:
            score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        yield from self.bpe_tokens()
@ -304,27 +306,27 @@ class SentencePieceVocab:
            text: bytes = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)

-            toktype = 1  # defualt to normal token type
+            toktype = gguf.TokenType.NORMAL
            if tokenizer.is_unknown(i):
-                toktype = 2
+                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.is_control(i):
-                toktype = 3
+                toktype = gguf.TokenType.CONTROL

            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = 4
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

            if tokenizer.is_unused(i):
-                toktype = 5
+                toktype = gguf.TokenType.UNUSED
            if tokenizer.is_byte(i):
-                toktype = 6
+                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
        for text in self.added_tokens_list:
            score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
        yield from self.sentencepiece_tokens()
@ -342,6 +344,7 @@ Vocab = Union[BpeVocab, SentencePieceVocab]
 #

 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_head_kv) )
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@ -724,6 +727,7 @@ class OutputFile:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

    def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_name                ("llama")
        self.gguf.add_context_length      (params.n_ctx)
        self.gguf.add_embedding_length    (params.n_embd)
        self.gguf.add_block_count         (params.n_layer)
@ -836,12 +840,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
            print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
           #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
            print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
        else:
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
 struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
        float score;
+        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };

 struct my_llama_hparams {
@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab->id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab->id_to_token[i].tok   = tok;
-            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
        }
        llama_free(lctx);
        llama_free_model(lmodel);
    } else { // assume llama2.c vocabulary
        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
        llama_file file(filename, "rb");
-        uint32_t n_vocab = config->vocab_size;
+        const int  n_vocab = config->vocab_size;
        /* uint32_t max_token_length =  */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
-        for (uint32_t i=0; i<n_vocab; ++i) {
+        for (int i=0; i<n_vocab; ++i) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
-            std::string tok = file.read_string(len);
-            vocab->id_to_token[i].tok = tok;
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
        }
    }
 }
@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 //    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
 //    uint32_t n_vocab = model->hparams.n_vocab;
 //    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_score = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_score.tok.size());
-//        file.write_raw(token_score.tok.data(), token_score.tok.size());
-//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.text.size());
+//        file.write_raw(token_data.text.data(), token_data.text.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
 //    }
 //
 //    // stuff AK weights into GG weights one by one.
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -5,6 +5,7 @@
 #include <cmath>
 #include <ctime>
 #include <sstream>
+#include <cstring>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -121,6 +122,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    printf("\n");
 }

+// Evaluates `tokens` in chunks of at most `n_batch` tokens, starting at position
+// `n_past`, and returns the logits copied out after each chunk, concatenated.
+// Returns an empty vector if llama_eval reports a failure.
+// Assumes llama_get_logits exposes n_vocab floats per evaluated token - confirm.
+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+        int n_vocab, int n_thread) {
+    std::vector<float> all_logits;
+    all_logits.reserve(tokens.size() * n_vocab);
+
+    const size_t n_total  = tokens.size();
+    const size_t n_chunks = (n_total + n_batch - 1)/n_batch;
+    for (size_t chunk = 0; chunk < n_chunks; ++chunk) {
+        const size_t first = chunk * n_batch;
+        // The last chunk may hold fewer than n_batch tokens.
+        const size_t count = std::min(n_total - first, size_t(n_batch));
+
+        if (llama_eval(ctx, tokens.data() + first, count, n_past, n_thread)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return {};
+        }
+
+        const auto chunk_logits = llama_get_logits(ctx);
+        all_logits.insert(all_logits.end(), chunk_logits, chunk_logits + count * n_vocab);
+
+        n_past += count;
+    }
+    return all_logits;
+}
+
 void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // Calculates hellaswag score (acc_norm) from prompt
    //
@ -209,17 +231,19 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);

+    std::vector<float> tok_logits(n_vocab);
+
    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {

        // Tokenize the context to count tokens
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
        size_t context_size = context_embd.size();

-        for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
-
-            // Tokenize the query
-            std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
-            size_t query_size = query_embd.size();
+        // Do the 1st ending
+        // In this case we include the context when evaluating
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_size = query_embd.size();
+        //printf("First query: %d\n",(int)query_size);

        // Stop if query wont fit the ctx window
        if (query_size > (size_t)params.n_ctx) {
@ -232,25 +256,66 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            query_embd.resize(32);
        }

-            // Evaluate the query
-            if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        if (logits.empty()) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
        }

-            const auto query_logits = llama_get_logits(ctx);
-            std::vector<float> logits;
-            logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
+        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        const auto first_probs = softmax(tok_logits);

-            hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
-            hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
+        hs_data[task_idx].ending_logprob_count[0] = 1;
+        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);

        // Calculate the logprobs over the ending
-            for (size_t j = context_size-1; j < query_size - 1; j++) {
-                // Calculate probability of next token, given the previous ones.
-                const std::vector<float> tok_logits(
-                    logits.begin() + (j + 0) * n_vocab,
-                    logits.begin() + (j + 1) * n_vocab);
+        for (size_t j = context_size; j < query_size - 1; j++) {
+
+            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+            hs_data[task_idx].ending_logprob[0] += std::log(prob);
+            hs_data[task_idx].ending_logprob_count[0]++;
+        }
+
+        // Calculate the mean token logprob for acc_norm
+        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
+
+        // Do the remaining endings
+        // For these, we use the bare ending with n_past = context_size
+        //
+        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
+
+            // Tokenize the query
+            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_size = query_embd.size();
+
+            // Stop if query wont fit the ctx window
+            if (context_size + query_size > (size_t)params.n_ctx) {
+                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+                return;
+            }
+
+            // Speedup small evaluations by evaluating atleast 32 tokens
+            // No, resizing to 32 is actually slightly slower (at least on CUDA)
+            //if (query_size < 32) {
+            //    query_embd.resize(32);
+            //}
+
+            // Evaluate the query
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            if (logits.empty()) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
+            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
+
+            // Calculate the logprobs over the ending
+            for (size_t j = 0; j < query_size - 1; j++) {
+                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));

                const float prob = softmax(tok_logits)[query_embd[j + 1]];

@ -267,9 +332,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        // Find the ending with maximum logprob
-        size_t ending_logprob_max_idx = -1;
-        double ending_logprob_max_val = -INFINITY;
-        for (size_t j=0; j < 4; j++) {
+        size_t ending_logprob_max_idx = 0;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
+        for (size_t j = 1; j < 4; j++) {
            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                ending_logprob_max_idx = j;
                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline

 FILES=$(ls $PUBLIC)

+cd "$PUBLIC" || exit 1 # abort instead of running xxd from the wrong directory and truncating the .hpp files
 for FILE in $FILES; do
-  func=$(echo $FILE | tr '.' '_')
-  echo "generate $FILE.hpp ($func)"
-  xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+  echo "generate $FILE.hpp"
+
+  # use simple flag for old version of xxd
+  # (run from inside $PUBLIC so xxd is given a bare file name; quoted so odd file names are not word-split)
+  xxd -i "$FILE" > "$DIR/$FILE.hpp"
 done
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -144,12 +144,12 @@
    import { SchemaConverter } from '/json-schema-to-grammar.mjs';

    const session = signal({
-      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
      historyTemplate: "{{name}}: {{message}}",
      transcript: [],
      type: "chat",
-      char: "llama",
+      char: "Llama",
      user: "User",
    })

--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type; // token category, as returned by llama_token_get_type()

-    struct token_score {
-        token tok;
+    struct token_data { // per-token record; replaces the former token_score
+        token text;     // token string (formerly `tok`)
         float score;
+        ttype type;     // token category
     };

     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token; // indexed by token id
 };

 struct my_llama_hparams {
@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 //    // write_vocab
 //    uint32_t n_vocab = model->hparams.n_vocab;
 //    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_score = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_score.tok.size());
-//        file.write_raw(token_score.tok.data(), token_score.tok.size());
-//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.text.size());
+//        file.write_raw(token_data.text.data(), token_data.text.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
 //    }
 //    // write tensors
 //    write_tensor(&file, model->tok_embeddings);
@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {

    struct llama_vocab vocab;
    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab.id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
        }
    }

--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
        threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
        for (int i = 0; i < 8; i++) {
+            threadgroup_barrier(mem_flags::mem_device);
            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
        }

-        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup_barrier(mem_flags::mem_device);
        device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
        if (sgitg==0) {
            for (int i = 0; i < n_rows; i++) {
--- a/ggml.c
+++ b/ggml.c
@ -1643,11 +1643,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);

 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name                = "i8",
+        .blck_size                = 1,
+        .type_size                = sizeof(int8_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name                = "i16",
+        .blck_size                = 1,
+        .type_size                = sizeof(int16_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name                = "i32",
+        .blck_size                = 1,
+        .type_size                = sizeof(int32_t),
+        .is_quantized             = false,
+    },
    [GGML_TYPE_F32] = {
+        .type_name                = "f32",
+        .blck_size                = 1,
+        .type_size                = sizeof(float),
+        .is_quantized             = false,
        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
        .vec_dot_type             = GGML_TYPE_F32,
    },
    [GGML_TYPE_F16] = {
+        .type_name                = "f16",
+        .blck_size                = 1,
+        .type_size                = sizeof(ggml_fp16_t),
+        .is_quantized             = false,
        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@ -1655,6 +1681,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_F16,
    },
    [GGML_TYPE_Q4_0] = {
+        .type_name                = "q4_0",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
        .from_float               = quantize_row_q4_0,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
@ -1662,6 +1692,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q4_1] = {
+        .type_name                = "q4_1",
+        .blck_size                = QK4_1,
+        .type_size                = sizeof(block_q4_1),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
        .from_float               = quantize_row_q4_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
@ -1669,6 +1703,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q5_0] = {
+        .type_name                = "q5_0",
+        .blck_size                = QK5_0,
+        .type_size                = sizeof(block_q5_0),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
        .from_float               = quantize_row_q5_0,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
@ -1676,6 +1714,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q5_1] = {
+        .type_name                = "q5_1",
+        .blck_size                = QK5_1,
+        .type_size                = sizeof(block_q5_1),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
        .from_float               = quantize_row_q5_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
@ -1683,6 +1725,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q8_0] = {
+        .type_name                = "q8_0",
+        .blck_size                = QK8_0,
+        .type_size                = sizeof(block_q8_0),
+        .is_quantized             = true,
        .to_float                 = dequantize_row_q8_0,
        .from_float               = quantize_row_q8_0,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
@ -1690,12 +1736,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q8_1] = {
+        .type_name                = "q8_1",
+        .blck_size                = QK8_1,
+        .type_size                = sizeof(block_q8_1),
+        .is_quantized             = true,
        .from_float               = quantize_row_q8_1,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
        .vec_dot_type             = GGML_TYPE_Q8_1,
    },
 #ifdef GGML_USE_K_QUANTS
    [GGML_TYPE_Q2_K] = {
+        .type_name                = "q2_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q2_K),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
        .from_float               = quantize_row_q2_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
@ -1703,6 +1757,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q3_K] = {
+        .type_name                = "q3_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q3_K),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
        .from_float               = quantize_row_q3_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
@ -1710,6 +1768,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q4_K] = {
+        .type_name                = "q4_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q4_K),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
        .from_float               = quantize_row_q4_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
@ -1717,6 +1779,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q5_K] = {
+        .type_name                = "q5_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q5_K),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
        .from_float               = quantize_row_q5_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
@ -1724,6 +1790,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q6_K] = {
+        .type_name                = "q6_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q6_K),
+        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
        .from_float               = quantize_row_q6_K,
        .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
@ -1731,15 +1801,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
    },
    [GGML_TYPE_Q8_K] = {
+        .type_name                = "q8_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q8_K),
+        .is_quantized             = true,
        .from_float               = quantize_row_q8_K,
    }
 #endif
 };

 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT); // guard the table lookup against out-of-range type values
+    return type_traits[type];            // returned by value: the caller receives a copy of the table entry
 }


@ -3648,98 +3722,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    *s = idx;
 }

-//
-// data types
-//
-
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "NONE",

@ -4109,7 +4091,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    //
    // is enough, but just in case, adding the second part

-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
 }

 size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
@ -4119,23 +4101,27 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); // bytes for nrows_split rows: elements * bytes-per-block / elements-per-block
 }

 int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
+    return type_traits[type].blck_size; // elements per block (1 for the non-quantized entries of type_traits)
 }

 size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
+    return type_traits[type].type_size; // size in bytes of one block of this type
 }

 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; // bytes per element; may be fractional for block types
 }

 const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
+    return type_traits[type].type_name; // NOTE(review): NULL for types without a type_traits entry (K-quant entries exist only under GGML_USE_K_QUANTS) - confirm callers tolerate NULL
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized; // zero-initialized (false) for types without a type_traits entry
 }

 const char * ggml_op_name(enum ggml_op op) {
@ -4147,7 +4133,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }

 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
+    return ggml_type_size(tensor->type); // for quantized types this is the size of one block, not of a single element
 }

 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@ -4185,10 +4171,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
        (t0->ne[3] == t1->ne[3]);
 }

-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
    enum ggml_type wtype = GGML_TYPE_COUNT;

@ -4226,8 +4208,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@ -4236,7 +4218,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@ -4251,7 +4233,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@ -4570,7 +4552,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
    size_t data_size = 0;

    if (data == NULL && !ctx->no_alloc) {
-        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
        for (int i = 1; i < n_dims; i++) {
            data_size *= ne[i];
        }
@ -4625,8 +4607,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        result->ne[i] = ne[i];
    }

-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
    for (int i = 2; i < GGML_MAX_DIMS; i++) {
        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
    }
@ -7748,7 +7730,7 @@ static void ggml_compute_forward_dup_same_cont(
        memcpy(
            ((char *)  dst->data + ie0*nb0),
            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
    }

 }
@ -7782,7 +7764,7 @@ static void ggml_compute_forward_dup_f16(

    if (src0->type == dst->type &&
        ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
        // copy by rows
        const size_t rs = ne00*nb00;
        for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -7840,7 +7822,7 @@ static void ggml_compute_forward_dup_f16(
                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

                size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                char * dst_ptr = (char *) dst->data;

                for (int i03 = 0; i03 < ne03; i03++) {
@ -8053,7 +8035,7 @@ static void ggml_compute_forward_dup_f32(

    if (src0->type == dst->type &&
        ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
        // copy by rows
        const size_t rs = ne00*nb00;
        for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -8092,7 +8074,7 @@ static void ggml_compute_forward_dup_f32(
                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;

                size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                char * dst_ptr = (char *) dst->data;

                for (int i03 = 0; i03 < ne03; i03++) {
@ -8504,7 +8486,7 @@ static void ggml_compute_forward_add_q_f32(
    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == sizeof(float));

    // dst cannot be transposed or permuted
@ -8778,7 +8760,7 @@ static void ggml_compute_forward_add1_q_f32(
    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

    // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 <= nb1);
@ -10634,7 +10616,7 @@ static void ggml_compute_forward_mul_mat(
    GGML_ASSERT(ne3 == ne13);

    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == sizeof(float));

    // dst cannot be transposed or permuted
@ -10717,7 +10699,7 @@ static void ggml_compute_forward_mul_mat(
    if (params->type == GGML_TASK_INIT) {
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

            for (int64_t i13 = 0; i13 < ne13; ++i13) {
                for (int64_t i12 = 0; i12 < ne12; ++i12) {
@ -10737,7 +10719,7 @@ static void ggml_compute_forward_mul_mat(
    }

    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

    const int64_t nr0 = ne01;           // src0 rows
    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@ -11210,7 +11192,7 @@ static void ggml_compute_forward_get_rows_q(

    assert( dst->ne[0] == nc);
    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+    assert(src0->nb[0] == ggml_type_size(type));

    for (int i = 0; i < nr; ++i) {
        const int r = ((int32_t *) src1->data)[i];
@ -16387,7 +16369,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {

                    size_t cur = 0;
                    if (ggml_is_quantized(node->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                    }

                    work_size = MAX(work_size, cur);
@ -16400,7 +16382,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    size_t cur = 0;

                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                    }

                    work_size = MAX(work_size, cur);
@ -16412,7 +16394,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    size_t cur = 0;

                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                    }

                    work_size = MAX(work_size, cur);
@ -16495,12 +16477,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                                     //       the threads are still spinning
                        if (node->src[0]->type != GGML_TYPE_F32) {
                            // here we need memory just for single 2D matrix from src0
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                        }
                    } else
 #endif
                    if (node->src[1]->type != vec_dot_type) {
-                        cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                    } else {
                        cur = 0;
                    }
@ -18306,8 +18288,8 @@ enum ggml_opt_result ggml_opt_resume(
        struct ggml_tensor * f) {

    // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
+    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));

    struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
    struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
--- a/ggml.h
+++ b/ggml.h
@ -1856,6 +1856,10 @@ extern "C" {
    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
        ggml_to_float_t   to_float;
        ggml_from_float_t from_float;
        ggml_from_float_t from_float_reference;
@ -1863,7 +1867,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;

-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef  __cplusplus
 }
--- a/gguf.py
+++ b/gguf.py
@ -5,7 +5,7 @@ import tempfile
 import numpy as np

 from enum import IntEnum, auto
-from typing import Any, IO, List
+from typing import Any, IO, List, Optional

 #
 # constants
@ -45,7 +45,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
-KEY_ROPE_SCALE           = "{arch}.rope.scale"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

 # tokenization
 KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
@ -61,6 +61,7 @@ KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
 KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
 KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"

+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:

    return tensor_map

+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
 #
 # implementation
 #
@ -327,6 +337,18 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
 class GGMLQuantizationType(IntEnum):
    F32  = 0
    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15


 class GGUFValueType(IntEnum):
@ -359,7 +381,7 @@ class GGUFValueType(IntEnum):


 class GGUFWriter:
-    def __init__(self, path: str, arch: str):
+    def __init__(self, path: str, arch: str, use_temp_file = True):
        self.fout = open(path, "wb")
        self.arch = arch
        self.offset_tensor = 0
@ -369,6 +391,8 @@ class GGUFWriter:
        self.ti_data = b""
        self.ti_data_count = 0
        self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
@ -476,8 +500,8 @@ class GGUFWriter:
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

-    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
-        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
        self.ti_data += struct.pack("<I", len(encoded_name))
@ -486,23 +510,30 @@ class GGUFWriter:
        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
-
+        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
        self.ti_data += struct.pack("<I", dtype)
        self.ti_data += struct.pack("<Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1

-    def add_tensor(self, name: str, tensor: np.ndarray):
-        if not hasattr(self, "temp_file"):
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
            self.temp_file.seek(0)

-        self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return

        tensor.tofile(self.temp_file)

-        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
        if pad != 0:
            self.temp_file.write(bytes([0] * pad))

@ -524,6 +555,13 @@ class GGUFWriter:
        if pad != 0:
            self.fout.write(bytes([0] * pad))

+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
        self.temp_file.seek(0)

        shutil.copyfileobj(self.temp_file, self.fout)
@ -620,8 +658,8 @@ class GGUFWriter:
        self.add_uint32(
            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

-    def add_rope_scale(self, value:  float):
-        self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
+    def add_rope_scale_linear(self, value:  float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)
--- a/llama.cpp
+++ b/llama.cpp
@ -771,11 +771,12 @@ struct llama_vocab {

    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

    struct token_data {
-        token tok;
+        token text;
        float score;
-        int toktype;
+        ttype type;
    };

    llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@ -1436,6 +1437,14 @@ static void llama_model_load_internal(
        hparams.n_head_kv = hparams.n_head;
        GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");

+        // TODO: manually setting rope scale should override this
+        // rope_freq_scale (inverse of the kv) is optional
+        float ropescale = 1.0f;
+        GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
+        if (ropescale != 1.0f) {
+            rope_freq_scale = 1.0f/ropescale;
+        }
+
        // get general kv
        GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
        GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
@ -1513,12 +1522,12 @@ static void llama_model_load_internal(
            vocab.token_to_id[word] = i;

            auto & token_data = vocab.id_to_token[i];
-            token_data.tok = std::move(word);
+            token_data.text  = std::move(word);
            token_data.score = scores[i];
-            token_data.toktype = toktypes[i];
+            token_data.type  = (llama_token_type) toktypes[i];

            // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.tok == "<0x0A>") {
+            if (token_data.text == "<0x0A>") {
                vocab.linefeed_id = i;
            }
        }
@ -1550,12 +1559,12 @@ static void llama_model_load_internal(
        LLAMA_LOG_INFO("%s: general.name = %s\n",    __func__, general_name.c_str());

        // special tokens
-        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].tok.c_str() ); }
-        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].tok.c_str() ); }
-        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].tok.c_str() ); }
-        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].tok.c_str() ); }
-        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].tok.c_str() ); }
-        if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].tok.c_str() );    }
+        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+        if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
    }

    if (vocab_only) {
@ -2347,15 +2356,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 1;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 2;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 3;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
+}
+
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

 static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
@ -2373,22 +2394,10 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
    return id == vocab.special_pad_id;
 }

-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 4;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 5;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 6;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto& token_data = vocab.id_to_token.at(id);
-    auto buf = token_data.tok.substr(3, 2);
+    auto buf = token_data.text.substr(3, 2);
    return strtol(buf.c_str(), NULL, 16);
 }

@ -2701,6 +2710,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(

    bool found            = false;
    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+
    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

    do {
@ -4949,25 +4959,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }

-int llama_get_vocab(
-        const struct llama_context * ctx,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
+const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].text.c_str();
 }

-int llama_model_get_vocab(
-        const struct llama_model * model,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = model->vocab.id_to_token[i].score;
+float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].score;
 }
-    return n;
+
+llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].type;
 }

 llama_token llama_token_bos(const struct llama_context * ctx) {
@ -5038,7 +5039,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char

 int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
    if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
-        std::string result = ctx->model.vocab.id_to_token[token].tok;
+        std::string result = ctx->model.vocab.id_to_token[token].text;
        if (length < (int) result.length()) {
            return -result.length();
        }
@ -5052,7 +5053,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
 int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
    if (0 <= token && token < llama_model_n_vocab(model)) {
        if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].tok;
+            std::string result = model->vocab.id_to_token[token].text;
            if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
                result = llama_unescape_whitespace(result);
            }
--- a/llama.h
+++ b/llama.h
@ -72,6 +72,16 @@ extern "C" {
        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
    };

+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
    // model file types
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32              = 0,
@ -330,19 +340,11 @@ extern "C" {
    // Vocab
    //

-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);

-    LLAMA_API int llama_model_get_vocab(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence