From 7af633aec339367e36c867ae709088d6a801aa75 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 18 Aug 2023 17:48:31 +0300
Subject: [PATCH 01/25] readme : incoming BREAKING CHANGE

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/README.md b/README.md
index 8e467f159..da1c188cf 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,13 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-**Hot topics:**
+### 🚧 Incoming breaking change + refactoring:
 
-- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
-- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
-- New roadmap: https://github.com/users/ggerganov/projects/7
-- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
-- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+
+To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+
+----
 
 <details>
   <summary>Table of Contents</summary>

From 2d8b76a110d76ff6b5728ff0af8477531e4db60e Mon Sep 17 00:00:00 2001
From: Adrian <smith.adriane@gmail.com>
Date: Fri, 18 Aug 2023 12:39:22 -0700
Subject: [PATCH 02/25] Add link to clojure bindings to Readme. (#2659)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index da1c188cf..9f8512dc5 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 
 **UI:**
 

From f63564adfaa157ca387071d6b9a06cfaef0ef576 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <iainst0409@gmail.com>
Date: Sat, 19 Aug 2023 05:41:32 +0800
Subject: [PATCH 03/25] server : update xxd usage for older versions
 compatibility (#2649)

* server : update xxd usage for older versions compatibility

* remove unused $func
---
 examples/server/deps.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/server/deps.sh b/examples/server/deps.sh
index 1e9fe964b..ea23e6450 100755
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline
 
 FILES=$(ls $PUBLIC)
 
+cd $PUBLIC
 for FILE in $FILES; do
-  func=$(echo $FILE | tr '.' '_')
-  echo "generate $FILE.hpp ($func)"
-  xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+  echo "generate $FILE.hpp"
+
+  # use simple flag for old version of xxd
+  xxd -i $FILE > $DIR/$FILE.hpp
 done

From 1f0bccb27929e261744c979bc75114955da49e98 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 19 Aug 2023 00:45:36 +0300
Subject: [PATCH 04/25] server : better default prompt (#2646)

---
 examples/server/public/index.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index f204fff18..5eedb0b28 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -144,12 +144,12 @@
     import { SchemaConverter } from '/json-schema-to-grammar.mjs';
 
     const session = signal({
-      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
       template: "{{prompt}}\n\n{{history}}\n{{char}}:",
       historyTemplate: "{{name}}: {{message}}",
       transcript: [],
       type: "chat",
-      char: "llama",
+      char: "Llama",
       user: "User",
     })
 

From 5e9ff54a675d163d9f42aad1b5b3e734f17b2701 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 20 Aug 2023 16:44:46 +0300
Subject: [PATCH 05/25] More efficient Hellaswag implementation (#2677)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/perplexity/perplexity.cpp | 92 +++++++++++++++++++++++-------
 1 file changed, 70 insertions(+), 22 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b9b28a20b..682c39b16 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <ctime>
 #include <sstream>
+#include <cstring>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -209,50 +210,97 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     double acc = 0.0f;
     const int n_vocab = llama_n_vocab(ctx);
 
+    std::vector<float> tok_logits(n_vocab);
+
     for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
 
         // Tokenize the context to count tokens
         std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
         size_t context_size = context_embd.size();
 
-        for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
+        // Do the 1st ending
+        // In this case we include the context when evaluating
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_size = query_embd.size();
+        //printf("First query: %d\n",(int)query_size);
+
+        // Stop if query wont fit the ctx window
+        if (query_size > (size_t)params.n_ctx) {
+            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+            return;
+        }
+
+        // Speedup small evaluations by evaluating atleast 32 tokens
+        if (query_size < 32) {
+            query_embd.resize(32);
+        }
+
+        // Evaluate the query
+        if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+
+        auto query_logits = llama_get_logits(ctx);
+
+        std::memcpy(tok_logits.data(), query_logits + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        const auto first_probs = softmax(tok_logits);
+
+        hs_data[task_idx].ending_logprob_count[0] = 1;
+        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
+
+        // Calculate the logprobs over the ending
+        for (size_t j = context_size; j < query_size - 1; j++) {
+
+            std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
+
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+            hs_data[task_idx].ending_logprob[0] += std::log(prob);
+            hs_data[task_idx].ending_logprob_count[0]++;
+        }
+
+        // Calculate the mean token logprob for acc_norm
+        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
+
+        // Do the remaining endings
+        // For these, we use the bare ending with n_past = context_size
+        //
+        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
 
             // Tokenize the query
-            std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
-            size_t query_size = query_embd.size();
+            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_size = query_embd.size();
+            //printf("Second query: %d\n",(int)query_size);
 
             // Stop if query wont fit the ctx window
-            if (query_size > (size_t)params.n_ctx) {
+            if (context_size + query_size > (size_t)params.n_ctx) {
                 fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
                 return;
             }
 
             // Speedup small evaluations by evaluating atleast 32 tokens
-            if (query_size < 32) {
-                query_embd.resize(32);
-            }
+            // No, resizing to 32 is actually slightly slower (at least on CUDA)
+            //if (query_size < 32) {
+            //    query_embd.resize(32);
+            //}
 
             // Evaluate the query
-            if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
+            if (llama_eval(ctx, query_embd.data(), query_embd.size(), context_size, params.n_threads)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
 
-            const auto query_logits = llama_get_logits(ctx);
-            std::vector<float> logits;
-            logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
+            query_logits = llama_get_logits(ctx);
 
-            hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
-            hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
+            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
 
             // Calculate the logprobs over the ending
-            for (size_t j = context_size-1; j < query_size - 1; j++) {
-                // Calculate probability of next token, given the previous ones.
-                const std::vector<float> tok_logits(
-                    logits.begin() + (j + 0) * n_vocab,
-                    logits.begin() + (j + 1) * n_vocab);
+            for (size_t j = 0; j < query_size - 1; j++) {
+                std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
 
-                const float prob = softmax(tok_logits)[query_embd[ j + 1]];
+                const float prob = softmax(tok_logits)[query_embd[j + 1]];
 
                 hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
                 hs_data[task_idx].ending_logprob_count[ending_idx]++;
@@ -267,9 +315,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
 
         // Find the ending with maximum logprob
-        size_t ending_logprob_max_idx = -1;
-        double ending_logprob_max_val = -INFINITY;
-        for (size_t j=0; j < 4; j++) {
+        size_t ending_logprob_max_idx = 0;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
+        for (size_t j = 1; j < 4; j++) {
             if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                 ending_logprob_max_idx = j;
                 ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];

From 9e232f0234073358e7031c1b8d7aa45020469a3b Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sun, 20 Aug 2023 22:17:53 +0200
Subject: [PATCH 06/25] ggml : move all type info to ggml_type_traits (#2663)

---
 ggml.c | 245 ++++++++++++++++++++++++++-------------------------------
 ggml.h |   6 +-
 2 files changed, 118 insertions(+), 133 deletions(-)

diff --git a/ggml.c b/ggml.c
index beb7f4641..44c43b424 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1643,11 +1643,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name                = "i8",
+        .blck_size                = 1,
+        .type_size                = sizeof(int8_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name                = "i16",
+        .blck_size                = 1,
+        .type_size                = sizeof(int16_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name                = "i32",
+        .blck_size                = 1,
+        .type_size                = sizeof(int32_t),
+        .is_quantized             = false,
+    },
     [GGML_TYPE_F32] = {
+        .type_name                = "f32",
+        .blck_size                = 1,
+        .type_size                = sizeof(float),
+        .is_quantized             = false,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type             = GGML_TYPE_F32,
     },
     [GGML_TYPE_F16] = {
+        .type_name                = "f16",
+        .blck_size                = 1,
+        .type_size                = sizeof(ggml_fp16_t),
+        .is_quantized             = false,
         .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1681,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_F16,
     },
     [GGML_TYPE_Q4_0] = {
+        .type_name                = "q4_0",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float               = quantize_row_q4_0,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1692,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
+        .type_name                = "q4_1",
+        .blck_size                = QK4_1,
+        .type_size                = sizeof(block_q4_1),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float               = quantize_row_q4_1,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1703,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q5_0] = {
+        .type_name                = "q5_0",
+        .blck_size                = QK5_0,
+        .type_size                = sizeof(block_q5_0),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float               = quantize_row_q5_0,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1714,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q5_1] = {
+        .type_name                = "q5_1",
+        .blck_size                = QK5_1,
+        .type_size                = sizeof(block_q5_1),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float               = quantize_row_q5_1,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1725,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
+        .type_name                = "q8_0",
+        .blck_size                = QK8_0,
+        .type_size                = sizeof(block_q8_0),
+        .is_quantized             = true,
         .to_float                 = dequantize_row_q8_0,
         .from_float               = quantize_row_q8_0,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1736,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q8_1] = {
+        .type_name                = "q8_1",
+        .blck_size                = QK8_1,
+        .type_size                = sizeof(block_q8_1),
+        .is_quantized             = true,
         .from_float               = quantize_row_q8_1,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
 #ifdef GGML_USE_K_QUANTS
     [GGML_TYPE_Q2_K] = {
+        .type_name                = "q2_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q2_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float               = quantize_row_q2_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1757,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q3_K] = {
+        .type_name                = "q3_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q3_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float               = quantize_row_q3_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1768,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q4_K] = {
+        .type_name                = "q4_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q4_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float               = quantize_row_q4_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1779,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q5_K] = {
+        .type_name                = "q5_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q5_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float               = quantize_row_q5_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1790,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q6_K] = {
+        .type_name                = "q6_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q6_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float               = quantize_row_q6_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1801,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {
+        .type_name                = "q8_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q8_K),
+        .is_quantized             = true,
         .from_float               = quantize_row_q8_K,
     }
 #endif
 };
 
 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return type_traits[type];
 }
 
 
@@ -3648,99 +3722,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
     *s = idx;
 }
 
-//
-// data types
-//
-
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
@@ -4110,29 +4091,33 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }
 
 int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
+    return type_traits[type].blck_size;
 }
 
 size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
+    return type_traits[type].type_size;
 }
 
 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
 const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
+    return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
 }
 
 const char * ggml_op_name(enum ggml_op op) {
@@ -4144,7 +4129,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
+    return ggml_type_size(tensor->type);
 }
 
 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4167,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
         (t0->ne[3] == t1->ne[3]);
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;
 
@@ -4223,8 +4204,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4233,7 +4214,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4248,7 +4229,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4567,7 +4548,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t data_size = 0;
 
     if (data == NULL && !ctx->no_alloc) {
-        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
         for (int i = 1; i < n_dims; i++) {
             data_size *= ne[i];
         }
@@ -4622,8 +4603,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }
 
-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
@@ -7745,7 +7726,7 @@ static void ggml_compute_forward_dup_same_cont(
         memcpy(
             ((char *)  dst->data + ie0*nb0),
             ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
     }
 
 }
@@ -7779,7 +7760,7 @@ static void ggml_compute_forward_dup_f16(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +7818,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8031,7 @@ static void ggml_compute_forward_dup_f32(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8070,7 @@ static void ggml_compute_forward_dup_f32(
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8482,7 @@ static void ggml_compute_forward_add_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -8775,7 +8756,7 @@ static void ggml_compute_forward_add1_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 <= nb1);
@@ -10629,7 +10610,7 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(ne3 == ne13);
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -10712,7 +10693,7 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +10713,7 @@ static void ggml_compute_forward_mul_mat(
     }
 
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
     const int64_t nr0 = ne01;           // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -11205,7 +11186,7 @@ static void ggml_compute_forward_get_rows_q(
 
     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+    assert(src0->nb[0] == ggml_type_size(type));
 
     for (int i = 0; i < nr; ++i) {
         const int r = ((int32_t *) src1->data)[i];
@@ -16382,7 +16363,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
                     size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16395,7 +16376,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16407,7 +16388,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16490,12 +16471,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                                      //       the threads are still spinning
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                         }
                     } else
 #endif
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     } else {
                         cur = 0;
                     }
@@ -18301,8 +18282,8 @@ enum ggml_opt_result ggml_opt_resume(
         struct ggml_tensor * f) {
 
     // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
+    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
 
     struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
     struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
diff --git a/ggml.h b/ggml.h
index bdbd12800..3a946dbdc 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1740,6 +1740,10 @@ extern "C" {
     typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
@@ -1747,7 +1751,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef  __cplusplus
 }

From 5a02b9625af73a449091880b64cb5d0b79ad02fe Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 03:24:29 +0200
Subject: [PATCH 07/25] convert-permute-debug.py : permute debug print

---
 convert-permute-debug.py | 1032 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 1032 insertions(+)
 create mode 100644 convert-permute-debug.py

diff --git a/convert-permute-debug.py b/convert-permute-debug.py
new file mode 100644
index 000000000..14927e8e4
--- /dev/null
+++ b/convert-permute-debug.py
@@ -0,0 +1,1032 @@
+#!/usr/bin/env python
+
+import gguf
+import argparse
+import concurrent.futures
+import copy
+import enum
+import faulthandler
+import functools
+import io
+import itertools
+import json
+import math
+import mmap
+import pickle
+import re
+import signal
+import struct
+import sys
+import zipfile
+import numpy as np
+
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
+from sentencepiece import SentencePieceProcessor  # type: ignore
+
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
+    faulthandler.register(signal.SIGUSR1)
+
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
+
+#
+# data types
+#
+
+@dataclass(frozen=True)
+class UnquantizedDataType:
+    name: str
+
+DT_F16  = UnquantizedDataType('F16')
+DT_F32  = UnquantizedDataType('F32')
+DT_I32  = UnquantizedDataType('I32')
+DT_BF16 = UnquantizedDataType('BF16')
+
+DataType = Union[UnquantizedDataType]
+
+DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+    DT_BF16: np.dtype(np.uint16),
+    DT_F16:  np.dtype(np.float16),
+    DT_F32:  np.dtype(np.float32),
+    DT_I32:  np.dtype(np.int32),
+}
+
+NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
+    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+
+SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    'BF16': DT_BF16,
+    'F16': DT_F16,
+    'F32': DT_F32,
+    'I32': DT_I32,
+}
+
+class GGMLFileType(enum.Enum):
+    AllF32    = 0
+    MostlyF16 = 1  # except 1d tensors
+
+    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
+        if len(tensor.shape) == 1:
+            # 1D tensors are always F32.
+            return DT_F32
+        elif self == GGMLFileType.AllF32:
+            return DT_F32
+        elif self == GGMLFileType.MostlyF16:
+            return DT_F16
+        else:
+            raise ValueError(self)
+
+
+#
+# hparams loading
+#
+
+@dataclass
+class Params:
+    n_vocab:    int
+    n_embd:     int
+    n_mult:     int
+    n_layer:    int
+    n_ctx:      int
+    n_ff:       int
+    n_head:     int
+    n_head_kv:  int
+    f_norm_eps: float
+
+    @staticmethod
+    def find_n_mult(n_ff: int, n_embd: int) -> int:
+        # hardcoded magic range
+        for n_mult in range(8192, 1, -1):
+            calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+            if calc_ff == n_ff:
+                return n_mult
+        raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
+
+    @staticmethod
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
+        n_head = n_embd // 128 # guessed
+        n_mult = 256           # guessed
+
+        # TODO: verify this
+        n_ff = int(2 * (4 * n_embd) / 3)
+        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
+
+        return Params(
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_mult     = n_mult,
+            n_layer    = n_layer,
+            n_ctx      = -1,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head,
+            f_norm_eps = 1e-5,
+        )
+
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab    = config["vocab_size"]
+        n_embd     = config["hidden_size"]
+        n_layer    = config["num_hidden_layers"]
+        n_ff       = config["intermediate_size"]
+        n_head     = config["num_attention_heads"]
+        n_head_kv  = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
+        f_norm_eps = config["rms_norm_eps"]
+
+        n_mult = Params.find_n_mult(n_ff, n_embd)
+
+        if "max_sequence_length" in config:
+            n_ctx = config["max_sequence_length"]
+        elif "max_position_embeddings" in config:
+            n_ctx = config["max_position_embeddings"]
+        else:
+            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
+        return Params(
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_mult     = n_mult,
+            n_layer    = n_layer,
+            n_ctx      = n_ctx,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head_kv,
+            f_norm_eps = f_norm_eps,
+        )
+
+    # LLaMA v2 70B params.json
+    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
+    @staticmethod
+    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab    = config["vocab_size"]
+        n_embd     = config["dim"]
+        n_layer    = config["n_layers"]
+        n_mult     = config["multiple_of"]
+        n_ctx      = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
+        n_ff       = -1
+        n_head     = config["n_heads"]
+        n_head_kv  = config["n_kv_heads"] if "n_kv_heads" in config else n_head
+        f_norm_eps = config["norm_eps"]
+
+        if n_vocab == -1:
+            n_vocab = model["tok_embeddings.weight"].shape[0]
+
+        if n_ff == -1:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
+        return Params(
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_mult     = n_mult,
+            n_layer    = n_layer,
+            n_ctx      = n_ctx,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head_kv,
+            f_norm_eps = f_norm_eps,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+
+        if hf_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+        elif orig_config_path.exists():
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        return params
+
+
+#
+# vocab
+#
+
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}
+        vocab_size: int = len(self.bpe_tokenizer)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.bpe_tokenizer
+        from transformers.models.gpt2 import tokenization_gpt2
+        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        byte_decoder = {v: k for k, v in byte_encoder.items()}
+        for i, item in enumerate(tokenizer):
+            text: bytes = item.encode("utf-8")
+            score: float = -i
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+Vocab = Union[BpeVocab, SentencePieceVocab]
+
+
+#
+# data loading
+# TODO: reuse (probably move to gguf.py?)
+#
+
+def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    print( "permute " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_head_kv) )
+    if n_head_kv is not None and n_head != n_head_kv:
+        n_head //= n_head_kv
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+
+class Tensor(metaclass=ABCMeta):
+    data_type: DataType
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> 'Tensor': ...
+    @abstractmethod
+    def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
+    @abstractmethod
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
+    def part(self, n_part: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
+    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
+
+
+def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    fp32_arr = bf16_arr.astype(np.uint32) << 16
+    return fp32_arr.view(np.float32)
+
+
+class UnquantizedTensor(Tensor):
+    def __init__(self, ndarray: NDArray) -> None:
+        assert isinstance(ndarray, np.ndarray)
+        self.ndarray = ndarray
+        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
+
+    def astype(self, data_type: DataType) -> Tensor:
+        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        if self.data_type == DT_BF16:
+            self.ndarray = bf16_to_fp32(self.ndarray)
+        return UnquantizedTensor(self.ndarray.astype(dtype))
+
+    def to_ggml(self) -> 'UnquantizedTensor':
+        return self
+
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
+    def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
+
+
+def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
+    tensor = lazy_tensor.load()
+    assert isinstance(tensor, UnquantizedTensor)
+
+    # double-check:
+    actual_shape = list(tensor.ndarray.shape)
+    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
+    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
+        if convert:
+            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
+        else:
+            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
+
+    return tensor.ndarray
+
+
+GGMLCompatibleTensor = Union[UnquantizedTensor]
+
+
+class DeferredPermutedTensor(Tensor):
+    def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
+        self.base = base
+        self.n_head = n_head
+        self.data_type = self.base.data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
+
+    def to_ggml(self) -> GGMLCompatibleTensor:
+        return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
+
+    def permute(self, n_head: int, n_head_kv: int) -> Tensor:
+        raise Exception("shouldn't permute twice")
+
+
+@dataclass
+class LazyTensor:
+    _load: Callable[[], Tensor]
+    shape: List[int]
+    data_type: DataType
+    description: str
+
+    def load(self) -> Tensor:
+        ret = self._load()
+        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
+        return ret
+
+    def astype(self, data_type: DataType) -> 'LazyTensor':
+        self.validate_conversion_to(data_type)
+
+        def load() -> Tensor:
+            return self.load().astype(data_type)
+        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
+
+    def validate_conversion_to(self, data_type: DataType) -> None:
+        if data_type == self.data_type:
+            return
+
+
+LazyModel = Dict[str, LazyTensor]
+
+
+@dataclass
+class ModelPlus:
+    model: LazyModel
+    paths: List[Path]  # Where this was read from.
+    format: Literal['ggml', 'torch', 'safetensors']
+    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
+
+
+def merge_sharded(models: List[LazyModel]) -> LazyModel:
+    # Original LLaMA models have each file contain one part of each tensor.
+    # Use a dict instead of a set to preserve order.
+    names = {name: None for model in models for name in model}
+
+    def convert(name: str) -> LazyTensor:
+        lazy_tensors: List[LazyTensor] = [model[name] for model in models]
+        if len(lazy_tensors) == 1:
+            # only one file; don't go through this procedure since there might
+            # be quantized tensors
+            return lazy_tensors[0]
+        if len(lazy_tensors[0].shape) == 1:
+            # the tensor is just duplicated in every file
+            return lazy_tensors[0]
+        if name.startswith('tok_embeddings.') or \
+           name.endswith('.attention.wo.weight') or \
+           name.endswith('.feed_forward.w2.weight'):
+            # split by columns
+            axis = 1
+        else:
+            # split by rows
+            axis = 0
+        concatenated_shape = list(lazy_tensors[0].shape)
+        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
+
+        def load() -> UnquantizedTensor:
+            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
+            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            return UnquantizedTensor(concatenated)
+        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
+        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
+    return {name: convert(name) for name in names}
+
+
+def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
+    formats = set(mp.format for mp in models_plus)
+    assert len(formats) == 1, "different formats?"
+    format = formats.pop()
+    paths = [path for mp in models_plus for path in mp.paths]
+    # Use the first non-None vocab, if any.
+    try:
+        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
+    except StopIteration:
+        vocab = None
+
+    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
+        # Transformers models put different tensors in different files, but
+        # don't split indivdual tensors between files.
+        model: LazyModel = {}
+        for mp in models_plus:
+            model.update(mp.model)
+    else:
+        model = merge_sharded([mp.model for mp in models_plus])
+
+    return ModelPlus(model, paths, format, vocab)
+
+
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute(n_head, n_head_kv)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
+
+def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute_part(n_part, n_head)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+
+def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().part(n_part)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
+
+
+# Functionality that simulates `torch.load` but where individual tensors are
+# only loaded into memory on demand, not all at once.
+# PyTorch can't do this natively as of time of writing:
+# - https://github.com/pytorch/pytorch/issues/64327
+# This allows us to de-shard without multiplying RAM usage, and also
+# conveniently drops the PyTorch dependency (though we still need numpy).
+
+
+@dataclass
+class LazyStorageKind:
+    data_type: DataType
+
+
+@dataclass
+class LazyStorage:
+    load: Callable[[int, int], NDArray]
+    kind: LazyStorageKind
+    description: str
+
+
+class LazyUnpickler(pickle.Unpickler):
+    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
+        super().__init__(fp)
+        self.data_base_path = data_base_path
+        self.zip_file = zip_file
+
+    def persistent_load(self, pid: Any) -> Any:
+        assert pid[0] == 'storage'
+        assert isinstance(pid[1], LazyStorageKind)
+        data_type = pid[1].data_type
+        filename_stem = pid[2]
+        filename = self.data_base_path + '/' + filename_stem
+        info = self.zip_file.getinfo(filename)
+
+        def load(offset: int, elm_count: int) -> NDArray:
+            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
+            if dtype is None:
+                raise Exception("tensor stored in unsupported format")
+            fp = self.zip_file.open(info)
+            fp.seek(offset * dtype.itemsize)
+            size = elm_count * dtype.itemsize
+            data = fp.read(size)
+            assert len(data) == size
+            return np.frombuffer(data, dtype)
+        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
+        return LazyStorage(load=load, kind=pid[1], description=description)
+
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
+                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
+        assert isinstance(storage, LazyStorage)
+
+        def load() -> UnquantizedTensor:
+            elm_count = stride[0] * size[0]
+            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
+        description = f'pickled storage_offset={storage_offset} in {storage.description}'
+        return LazyTensor(load, list(size), storage.kind.data_type, description)
+
+    # @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        return func(*args)
+
+    CLASSES: Dict[Any, Any] = {
+        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
+        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
+    }
+
+    def find_class(self, module: str, name: str) -> Any:
+        if not module.startswith('torch'):
+            return super().find_class(module, name)
+        return self.CLASSES[(module, name)]
+
+
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
+    zf = zipfile.ZipFile(outer_fp)
+    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
+    assert len(pickle_paths) == 1, pickle_paths
+    pickle_fp = zf.open(pickle_paths[0], 'r')
+    unpickler = LazyUnpickler(pickle_fp,
+                              data_base_path=pickle_paths[0][:-4],
+                              zip_file=zf)
+    model = unpickler.load()
+    as_dict = dict(model.items())
+    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
+
+
+def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    header_size, = struct.unpack('<Q', fp.read(8))
+    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    byte_buf = mapped[8 + header_size:]
+
+    def convert(info: Dict[str, Any]) -> LazyTensor:
+        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
+        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+        shape: List[int] = info['shape']
+        begin, end = info['data_offsets']
+        assert 0 <= begin <= end <= len(byte_buf)
+        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
+        buf = byte_buf[begin:end]
+
+        def load() -> UnquantizedTensor:
+            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
+        return LazyTensor(load, shape, data_type, description)
+    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
+    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
+
+
+def must_read(fp: IO[bytes], length: int) -> bytes:
+    ret = fp.read(length)
+    if len(ret) < length:
+        raise Exception("unexpectedly reached end of file")
+    return ret
+
+
+@functools.lru_cache(maxsize=None)
+def lazy_load_file(path: Path) -> ModelPlus:
+    fp = open(path, 'rb')
+    first8 = fp.read(8)
+    fp.seek(0)
+    if first8[:2] == b'PK':
+        # A zip file, i.e. PyTorch format
+        return lazy_load_torch_file(fp, path)
+    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
+        # Probably safetensors
+        return lazy_load_safetensors_file(fp, path)
+    else:
+        raise ValueError(f"unknown format: {path}")
+
+
+In = TypeVar('In')
+Out = TypeVar('Out')
+
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
+    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
+    fast enough, this will stop calling `func` at some point rather than
+    letting results pile up in memory.  Specifically, there is a max of one
+    output value buffered per thread.'''
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures: List[concurrent.futures.Future[Out]] = []
+        items_rev = list(iterable)[::-1]
+        for i in range(min(concurrency, len(items_rev))):
+            futures.append(executor.submit(func, items_rev.pop()))
+        while futures:
+            result = futures.pop(0).result()
+            if items_rev:
+                futures.append(executor.submit(func, items_rev.pop()))
+            yield result
+
+
+def check_vocab_size(params: Params, vocab: Vocab) -> None:
+    if params.n_vocab != vocab.vocab_size:
+        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
+        if params.n_vocab == vocab.vocab_size_base:
+            print("Ignoring added_tokens.json since model matches vocab size without it.")
+            vocab.added_tokens_list = []
+            vocab.vocab_size = vocab.vocab_size_base
+            return
+        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
+        if vocab.fname_added_tokens is not None:
+            msg += f" combined with {vocab.fname_added_tokens}"
+        msg += f" has {vocab.vocab_size})."
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        raise Exception(msg)
+
+
+class OutputFile:
+    def __init__(self, fname_out: Path) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+    def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
+        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
+        self.gguf.add_head_count          (params.n_head)
+        self.gguf.add_head_count_kv       (params.n_head_kv)
+        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
+
+    def add_meta_vocab(self, vocab: Vocab) -> None:
+        tokens = []
+        scores = []
+        for text, score in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+
+        self.gguf.add_tokenizer_model("llama")
+        self.gguf.add_token_list(tokens)
+        self.gguf.add_token_scores(scores)
+        #self.gguf.add_token_types(toktypes) # TODO: add this
+
+        # TODO: added / special tokens
+
+    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
+        n_elements = 1
+        for dim in tensor.shape:
+            n_elements *= dim
+        data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
+        data_nbytes = n_elements * data_type.itemsize
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
+
+    def write_meta(self) -> None:
+        self.gguf.write_header_to_file()
+        self.gguf.write_kv_data_to_file()
+
+    def write_tensor_info(self) -> None:
+        self.gguf.write_ti_data_to_file()
+
+    def close(self) -> None:
+        self.gguf.close()
+
+    @staticmethod
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
+        check_vocab_size(params, vocab)
+
+        of = OutputFile(fname_out)
+
+        # meta data
+        of.add_meta_arch(params)
+        of.add_meta_vocab(vocab)
+        of.write_meta()
+
+        of.close()
+
+    @staticmethod
+    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+        check_vocab_size(params, vocab)
+
+        of = OutputFile(fname_out)
+
+        # meta data
+        of.add_meta_arch(params)
+        of.add_meta_vocab(vocab)
+
+        # tensor info
+        for name, lazy_tensor in model.items():
+            of.add_tensor_info(name, lazy_tensor)
+
+        of.write_meta()
+        of.write_tensor_info()
+
+        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
+            name, lazy_tensor = item
+            return lazy_tensor.load().to_ggml().ndarray
+
+        # tensor data
+        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=1)
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
+            of.gguf.write_tensor_data(ndarray)
+
+        of.close()
+
+def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
+    wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+
+    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+        return GGMLFileType.AllF32
+    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
+        return GGMLFileType.MostlyF16
+
+    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+
+    raise Exception(f"Unexpected combination of types: {name_to_type}")
+
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
+
+def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+    tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
+
+    tmp = model
+
+    # HF models permut or pack some of the tensors, so we need to undo that
+    for i in itertools.count():
+        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+            print(f"Permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
+           #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
+        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+            print(f"Unpacking and permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+        else:
+            break
+
+    out: LazyModel = {}
+    for name, lazy_tensor in model.items():
+        name_new = name
+
+        if name in tmap:
+            name_new = tmap[name]
+        elif name.endswith(".weight") and name[:-7] in tmap:
+            name_new = tmap[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tmap:
+            name_new = tmap[name[:-5]] + ".bias"
+        else:
+            raise Exception(f"Unexpected tensor name: {name}")
+
+        if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new):
+            print(f"skipping tensor {name_new}")
+            continue
+        else:
+            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
+            out[name_new] = lazy_tensor
+
+    return out
+
+def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
+    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the nth path in the model.
+    '''
+    # Support the following patterns:
+    patterns: List[Tuple[str, str]] = [
+        # - x.00.pth, x.01.pth, etc.
+        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
+        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
+        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
+        # x.bin, x.bin.1, etc.
+        (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
+    ]
+    for regex, replacement in patterns:
+        if re.search(regex, path.name):
+            new_path = path.with_name(re.sub(regex, replacement, path.name))
+            if new_path.exists():
+                return new_path
+    return None
+
+
+def find_multifile_paths(path: Path) -> List[Path]:
+    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the whole list of paths in the model.
+    '''
+    ret: List[Path] = []
+    for i in itertools.count():
+        nth_path = nth_multifile_path(path, i)
+        if nth_path is None:
+            break
+        ret.append(nth_path)
+    if not ret:
+        # No matches.  This should only happen if the file was named, e.g.,
+        # foo.0, and there was no file named foo.  Oh well, try to process it
+        # as a single file.
+        return [path]
+    return ret
+
+
+def load_some_model(path: Path) -> ModelPlus:
+    '''Load a model of any supported format.'''
+    # Be extra-friendly and accept either a file or a directory:
+    if path.is_dir():
+        # Check if it's a set of safetensors files first
+        files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files:
+            # Try the PyTorch patterns too, with lower priority
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
+            files = [file for glob in globs for file in path.glob(glob)]
+        if not files:
+            raise Exception(f"Can't find model in directory {path}")
+        if len(files) > 1:
+            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+        path = files[0]
+
+    paths = find_multifile_paths(path)
+    models_plus: List[ModelPlus] = []
+    for path in paths:
+        print(f"Loading model file {path}")
+        models_plus.append(lazy_load_file(path))
+
+    model_plus = merge_multifile_models(models_plus)
+    return model_plus
+
+
+def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
+    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    # a directory, it might be the model directory, and tokenizer.model might
+    # be in the parent of that.
+    if path.is_dir():
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+            vocab_file = "vocab.json"
+        path2 = path / vocab_file
+        # Use `.parent` instead of /.. to handle the symlink case better.
+        path3 = path.parent / vocab_file
+        if path2.exists():
+            path = path2
+        elif path3.exists():
+            path = path3
+        else:
+            raise FileNotFoundError(
+                f"Could not find tokenizer.model in {path} or its parent; "
+                "if it's in another directory, pass the directory as --vocab-dir")
+
+    print(f"Loading vocab file '{path}', type '{vocabtype}'")
+
+    added_tokens_path = path.parent / "added_tokens.json"
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+
+
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
+    namestr = {
+        GGMLFileType.AllF32:    "f32",
+        GGMLFileType.MostlyF16: "f16",
+    }[file_type]
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+    if ret in model_paths:
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
+        sys.exit(1)
+    return ret
+
+
+def do_dump_model(model_plus: ModelPlus) -> None:
+    print(f"model_plus.paths = {model_plus.paths!r}")
+    print(f"model_plus.format = {model_plus.format!r}")
+    print(f"model_plus.vocab = {model_plus.vocab!r}")
+    for name, lazy_tensor in model_plus.model.items():
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
+
+
+def main(args_in: Optional[List[str]] = None) -> None:
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outtype",     choices=["f32", "f16"], help="output format (default: based on input)")
+    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
+    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
+    args = parser.parse_args(args_in)
+
+    if args.dump_single:
+        model_plus = lazy_load_file(args.model)
+        do_dump_model(model_plus)
+
+    model_plus = load_some_model(args.model)
+
+    params = Params.load(model_plus)
+    if params.n_ctx == -1:
+        if args.ctx is None:
+            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                            "Please specify one with --ctx:\n"
+                            " - LLaMA v1: --ctx 2048\n"
+                            " - LLaMA v2: --ctx 4096\n")
+        params.n_ctx = args.ctx
+
+    print(f"params = {params}")
+
+    vocab: Vocab
+    if args.vocab_only:
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
+        assert args.outfile, "need --outfile if using --vocab-only"
+        outfile = args.outfile
+        OutputFile.write_vocab_only(outfile, params, vocab)
+        print(f"Wrote {outfile}")
+    else:
+        if args.dump:
+            do_dump_model(model_plus)
+            return
+
+        if model_plus.vocab is not None and args.vocab_dir is None:
+            vocab = model_plus.vocab
+        else:
+            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
+            vocab = load_vocab(vocab_dir, args.vocabtype)
+
+        model       = model_plus.model
+        model       = convert_model_names(model, params)
+        output_type = pick_output_type(model, args.outtype)
+        model       = convert_to_output_type(model, output_type)
+        outfile     = args.outfile or default_outfile(model_plus.paths, output_type)
+
+        OutputFile.write_all(outfile, params, model, vocab)
+        print(f"Wrote {outfile}")
+
+
+if __name__ == '__main__':
+    main()

From 4f92488dd655d1ec768a8030ad1cd2e4b812a315 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 03:44:16 +0200
Subject: [PATCH 08/25] convert-permute-debug-master.py : permute debug for
 master

---
 convert-permute-debug-master.py | 1327 +++++++++++++++++++++++++++++++
 1 file changed, 1327 insertions(+)
 create mode 100644 convert-permute-debug-master.py

diff --git a/convert-permute-debug-master.py b/convert-permute-debug-master.py
new file mode 100644
index 000000000..7d64b2252
--- /dev/null
+++ b/convert-permute-debug-master.py
@@ -0,0 +1,1327 @@
+#!/usr/bin/env python
+import argparse
+import concurrent.futures
+import copy
+import enum
+import faulthandler
+import functools
+import io
+import itertools
+import json
+import math
+import mmap
+import pickle
+import re
+import signal
+import struct
+import sys
+import zipfile
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
+                    Literal, Optional, Sequence, Tuple, TypeVar, Union)
+
+import numpy as np
+from sentencepiece import SentencePieceProcessor  # type: ignore
+
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
+    faulthandler.register(signal.SIGUSR1)
+
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+@dataclass(frozen=True)
+class UnquantizedDataType:
+    name: str
+
+
+DT_F16 = UnquantizedDataType('F16')
+DT_F32 = UnquantizedDataType('F32')
+DT_I32 = UnquantizedDataType('I32')
+DT_BF16 = UnquantizedDataType('BF16')
+
+
+@dataclass(frozen=True)
+class QuantizedDataType:
+    groupsize: int
+    have_addends: bool
+    have_g_idx: bool
+
+
+DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
+DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
+
+DataType = Union[UnquantizedDataType, QuantizedDataType]
+
+DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
+    DT_F32: 0,
+    DT_F16: 1,
+    DT_Q4_0: 2,
+    DT_Q4_1: 3,
+}
+
+FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
+    {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
+
+DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+    DT_BF16: np.dtype(np.uint16),
+    DT_F16: np.dtype(np.float16),
+    DT_F32: np.dtype(np.float32),
+    DT_I32: np.dtype(np.int32),
+}
+
+NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
+    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+
+
+class GGMLFileType(enum.Enum):
+    AllF32 = 0
+    MostlyF16 = 1  # except 1d tensors
+    MostlyQ4_0 = 2  # except 1d tensors
+    MostlyQ4_1 = 3  # except 1d tensors
+    PerLayerIsQ4_1 = 4  # but tok_embeddings.weight and output.weight are F16
+
+    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
+        if len(tensor.shape) == 1:
+            # 1D tensors are always F32.
+            return DT_F32
+        elif self == GGMLFileType.AllF32:
+            return DT_F32
+        elif self == GGMLFileType.MostlyF16:
+            return DT_F16
+        elif self == GGMLFileType.MostlyQ4_0:
+            return DT_Q4_0
+        elif self == GGMLFileType.MostlyQ4_1:
+            return DT_Q4_1
+        elif self == GGMLFileType.PerLayerIsQ4_1:
+            if name in ('output.weight', 'tok_embeddings.weight'):
+                return DT_F16
+            else:
+                return DT_Q4_1
+        else:
+            raise ValueError(self)
+
+
+def make_tensors_list() -> List[str]:
+    ret = [
+        'tok_embeddings.weight',
+        'norm.weight',
+        'output.weight',
+    ]
+    for i in range(80):  # maximum number of layer
+        ret += [
+            f'layers.{i}.attention.wq.weight',
+            f'layers.{i}.attention.wk.weight',
+            f'layers.{i}.attention.wv.weight',
+            f'layers.{i}.attention.wo.weight',
+            f'layers.{i}.attention_norm.weight',
+            f'layers.{i}.feed_forward.w1.weight',
+            f'layers.{i}.feed_forward.w2.weight',
+            f'layers.{i}.feed_forward.w3.weight',
+            f'layers.{i}.ffn_norm.weight',
+        ]
+    return ret
+
+
+TENSORS_LIST = make_tensors_list()
+TENSORS_SET = set(TENSORS_LIST)
+
+
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(8192, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
+
+@dataclass
+class Params:
+    n_vocab:   int
+    n_embd:    int
+    n_mult:    int
+    n_head:    int
+    n_layer:   int
+    n_kv_head: Optional[int]  # This parameter is only used for Llama 2
+
+    @staticmethod
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
+        n_head=n_embd // 128 # guessed
+
+        return Params(
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = 256,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = None,
+        )
+
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"];
+        n_embd  = config["hidden_size"];
+        n_head  = config["num_attention_heads"];
+        n_layer = config["num_hidden_layers"];
+        n_ff    = config["intermediate_size"];
+        n_kv_head = config.get("num_key_value_heads")
+
+        n_mult = find_n_mult(n_ff, n_embd);
+
+        return Params(
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = n_mult,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = n_kv_head,
+        )
+
+    # LLaMA v2 70B params.json
+    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
+    @staticmethod
+    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab   = config["vocab_size"];
+        n_embd    = config["dim"];
+        n_head    = config["n_heads"];
+        n_layer   = config["n_layers"];
+        n_mult    = config["multiple_of"];
+
+        if n_vocab == -1:
+            n_vocab = model["tok_embeddings.weight"].shape[0]
+
+        return Params(
+            n_vocab   = n_vocab,
+            n_embd    = n_embd,
+            n_mult    = n_mult,
+            n_head    = n_head,
+            n_layer   = n_layer,
+            n_kv_head = None,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+
+        if hf_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+        elif orig_config_path.exists():
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+        self.vocabtype = vocabtype
+        if self.vocabtype == "bpe":
+          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+        else:
+          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens))
+        else:
+            added_tokens = {}
+        if self.vocabtype == "bpe":
+          vocab_size: int = len(self.sentencepiece_tokenizer)
+        else:
+          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.sentencepiece_tokenizer
+        if self.vocabtype == "bpe":
+          from transformers.models.gpt2 import tokenization_gpt2
+          byte_encoder = tokenization_gpt2.bytes_to_unicode()
+          byte_decoder = {v: k for k, v in byte_encoder.items()}
+          for i, item in enumerate(tokenizer):
+            text: bytes
+            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            score: float = -i
+            yield text, score
+        else:
+          for i in range(tokenizer.vocab_size()):
+              text: bytes
+              if tokenizer.is_unknown(i):
+                  text = " \u2047 ".encode("utf-8")
+              elif tokenizer.is_control(i):
+                  text = b""
+              elif tokenizer.is_byte(i):
+                  piece = tokenizer.id_to_piece(i)
+                  if len(piece) != 6:
+                      raise Exception(f"Invalid token: {piece}")
+                  byte_value = int(piece[3:-1], 16)
+                  text = struct.pack("B", byte_value)
+              else:
+                  text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+              score: float = tokenizer.get_score(i)
+              yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class GGMLVocab:
+    def __init__(self, tokens: List[Tuple[bytes, float]]):
+        self.tokens = tokens
+        self.vocab_size = len(tokens)
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        return self.tokens
+
+    def __repr__(self) -> str:
+        return f"<GGMLVocab with {self.vocab_size} tokens>"
+
+
+Vocab = Union[SentencePieceVocab, GGMLVocab]
+
+
+def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    print( "permute " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+
+def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
+    # First reinterpret each row from a list of int32s containing 8 values each
+    # to a list of uint8s containing 2 values each.
+    qvalues_pack8 = qvalues_pack32.view(np.uint8)
+
+    # Then split out the two values per int8 (which requires an actual
+    # conversion because numpy doesn't natively support int4s).
+    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
+    qvalues[:, 0::2] = qvalues_pack8 & 0xf
+    qvalues[:, 1::2] = qvalues_pack8 >> 4
+
+    assert addends is None or addends.shape == scales.shape
+    assert qvalues.shape[0] == scales.shape[0]
+    assert qvalues.shape[1] % scales.shape[1] == 0
+    if g_idx is None:
+        repeat_count = qvalues.shape[1] // scales.shape[1]
+        scales = scales[:, :, np.newaxis]
+        if addends is not None:
+            addends = addends[:, :, np.newaxis]
+        # Reshape so that the below computation broadcasts over scales and addends:
+        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
+    else:
+        # In this case the scale and addend is selected for each column by g_idx:
+        assert addends is not None
+        scales = scales[:, g_idx]
+        addends = addends[:, g_idx]
+    if addends is None:
+        # Q4_0
+        qvalues = qvalues.view(np.int8)
+        qvalues -= 8
+    # And do the actual 'value = scale * qvalue + addend' computation.
+    values = scales * qvalues
+    if addends is not None:
+        values += addends
+    if g_idx is None:
+        values.shape = (values.shape[0], values.shape[1] * values.shape[2])
+    return values
+
+
+class Tensor(metaclass=ABCMeta):
+    data_type: DataType
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> 'Tensor': ...
+    @abstractmethod
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
+    @abstractmethod
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
+    def part(self, n_part: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
+    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
+
+
+def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    fp32_arr = bf16_arr.astype(np.uint32) << 16
+    return fp32_arr.view(np.float32)
+
+
+class UnquantizedTensor(Tensor):
+    def __init__(self, ndarray: NDArray) -> None:
+        assert isinstance(ndarray, np.ndarray)
+        self.ndarray = ndarray
+        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
+
+    def astype(self, data_type: DataType) -> Tensor:
+        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        if self.data_type == DT_BF16:
+            self.ndarray = bf16_to_fp32(self.ndarray)
+        return UnquantizedTensor(self.ndarray.astype(dtype))
+
+    def to_ggml(self) -> 'UnquantizedTensor':
+        return self
+
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
+
+
+def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
+    tensor = lazy_tensor.load()
+    assert isinstance(tensor, UnquantizedTensor)
+
+    # double-check:
+    actual_shape = list(tensor.ndarray.shape)
+    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
+    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
+        if convert:
+            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
+        else:
+            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
+
+    return tensor.ndarray
+
+
+class GGMLQuantizedTensor(Tensor):
+    data_type: QuantizedDataType
+
+    def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
+        rows, columns = shape
+        assert data_type in (DT_Q4_1, DT_Q4_0)  # for now
+        assert isinstance(data_type, QuantizedDataType)  # redundant, but mypy complains without this
+        assert columns % data_type.groupsize == 0
+        words_in_block = 6 if data_type == DT_Q4_1 else 5
+        self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
+        self.shape = shape[:]
+        self.data_type = data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        if data_type == self.data_type:
+            return self
+        scales = self.ndarray[:, :, 0].view(np.float32)
+        if self.data_type.have_addends:
+            addends = self.ndarray[:, :, 1].view(np.float32)
+        else:
+            addends = None
+        qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
+
+        dq = dequantize_q4(qweights, scales, addends, g_idx=None)
+        return UnquantizedTensor(dq).astype(data_type)
+
+    def to_ggml(self) -> 'GGMLQuantizedTensor':
+        return self
+
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
+        return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
+
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
+GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
+
+
+class DeferredPermutedTensor(Tensor):
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
+        self.base = base
+        self.n_head = n_head
+        self.n_kv_head = n_kv_head
+        self.data_type = self.base.data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
+
+    def to_ggml(self) -> GGMLCompatibleTensor:
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
+
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
+        raise Exception("shouldn't permute twice")
+
+
+class GPTQForLLaMaQuantizedTensor(Tensor):
+    def __init__(self, model: 'LazyModel', namebase: str) -> None:
+        qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
+        scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
+
+        bias = model.get(f"{namebase}.bias")
+        if bias is not None:
+            # Q4_1 does not support bias; good thing the bias is always all zeros.
+            assert not np.any(load_unquantized(bias))
+
+        if f"{namebase}.zeros" in model:
+            zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
+        else:
+            qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
+            assert qzeros.dtype == np.int32
+            zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
+            assert zeros.dtype == np.float32
+
+        assert zeros.shape == scales.shape
+
+        # Output is transposed compared to the input, and addends have their sign flipped.
+        # Scales and zeros similarly must be transposed but only for newer
+        # versions of GPTQ-for-LLaMa; the older versions can be identified by
+        # having shape (n_embd, 1).
+        qweight = qweight.T
+        if scales.shape[1] != 1:
+            scales = scales.T
+            zeros = zeros.T
+
+        # Output also has signs flipped for the addends.
+        self.qweight = qweight
+        self.scales = scales
+        self.addends = -zeros
+
+        self.g_idx: Optional[NDArray]
+        if f"{namebase}.g_idx" in model:
+            self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
+            assert self.g_idx.shape == (qweight.shape[1] * 8,)
+        else:
+            self.g_idx = None
+
+        self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
+        self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
+                                           have_g_idx=(self.g_idx is not None))
+
+    def inspect(self, row: int, col: int) -> None:
+        '''For debugging.'''
+        qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
+        if self.g_idx is not None:
+            group = self.g_idx[col]
+        else:
+            group = int(col // self.groupsize())
+        scale = self.scales[row, group]
+        addend = self.addends[row, group]
+        with np.printoptions(precision=None, suppress=True):
+            print(f'scale:{scale} addend:{addend} qweight:{qweight}')
+            print('possible values:', np.arange(16) * scale + addend)
+            print('actual value:', qweight * scale + addend)
+
+    def astype(self, data_type: DataType) -> Tensor:
+        if isinstance(data_type, QuantizedDataType):
+            assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
+            return self.regroup(data_type.groupsize)
+
+        dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
+        return UnquantizedTensor(dequantized).astype(data_type)
+
+    def groupsize(self) -> int:
+        assert self.addends.shape == self.scales.shape
+        assert self.shape[1] % self.scales.shape[1] == 0
+        return self.shape[1] // self.scales.shape[1]
+
+    def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
+        # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
+        # columns in a row.  Newer versions share them between every set of N
+        # columns in a row, where N is the `groupsize` parameter, usually 128.  The
+        # output format shares them between every set of 32 columns.  To handle
+        # this, duplicate scales and addends for every smaller group.
+        # (In the above, 'row' and 'column' are in the sense of the output.)
+        assert self.g_idx is None
+        old_groupsize = self.groupsize()
+        assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
+        ret = copy.copy(self)
+        ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
+        ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
+        ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
+        return ret
+
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
+        return DeferredPermutedTensor(self, n_head, n_kv_head)
+
+    def to_ggml(self) -> GGMLQuantizedTensor:
+        # The output format looks like this:
+        # For each row:
+        #   For each group of 32 columns:
+        #     - addend (float32, 4 bytes)
+        #     - scale (float32, 4 bytes)
+        #     - weights (int4 * 32, 16 bytes)
+
+        if self.groupsize() != 32:
+            raise Exception("should have been regrouped before converting to ggml")
+
+        # Since the output format is mixed between integers and floats, we have
+        # to hackily view the floats as int32s just so numpy will let us
+        # concatenate them.
+        addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
+        scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
+
+        # Split into groups of 4 columns (i.e. 32 columns of quantized data):
+        grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
+
+        # And concatenate:
+        grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
+
+        return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
+
+
+@dataclass
+class LazyTensor:
+    _load: Callable[[], Tensor]
+    shape: List[int]
+    data_type: DataType
+    description: str
+
+    def load(self) -> Tensor:
+        ret = self._load()
+        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
+        return ret
+
+    def astype(self, data_type: DataType) -> 'LazyTensor':
+        self.validate_conversion_to(data_type)
+
+        def load() -> Tensor:
+            return self.load().astype(data_type)
+        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
+
+    def validate_conversion_to(self, data_type: DataType) -> None:
+        if data_type == self.data_type:
+            return
+        if isinstance(data_type, QuantizedDataType):
+            if not isinstance(self.data_type, QuantizedDataType):
+                raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
+            if self.data_type.have_g_idx:
+                sys.stderr.write(
+                    "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                    "which is not yet natively supported by GGML. "
+                    "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                    "but that will result in a much larger output file for no quality benefit.\n")
+                sys.exit(1)
+            assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
+
+
+LazyModel = Dict[str, LazyTensor]
+
+
+@dataclass
+class ModelPlus:
+    model: LazyModel
+    paths: List[Path]  # Where this was read from.
+    format: Literal['ggml', 'torch', 'safetensors']
+    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
+
+
+def merge_sharded(models: List[LazyModel]) -> LazyModel:
+    # Original LLaMA models have each file contain one part of each tensor.
+    # Use a dict instead of a set to preserve order.
+    names = {name: None for model in models for name in model}
+
+    def convert(name: str) -> LazyTensor:
+        lazy_tensors: List[LazyTensor] = [model[name] for model in models]
+        if len(lazy_tensors) == 1:
+            # only one file; don't go through this procedure since there might
+            # be quantized tensors
+            return lazy_tensors[0]
+        if len(lazy_tensors[0].shape) == 1:
+            # the tensor is just duplicated in every file
+            return lazy_tensors[0]
+        if name.startswith('tok_embeddings.') or \
+           name.endswith('.attention.wo.weight') or \
+           name.endswith('.feed_forward.w2.weight'):
+            # split by columns
+            axis = 1
+        else:
+            # split by rows
+            axis = 0
+        concatenated_shape = list(lazy_tensors[0].shape)
+        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
+
+        def load() -> UnquantizedTensor:
+            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
+            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            return UnquantizedTensor(concatenated)
+        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
+        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
+    return {name: convert(name) for name in names}
+
+
+def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
+    formats = set(mp.format for mp in models_plus)
+    assert len(formats) == 1, "different formats?"
+    format = formats.pop()
+    paths = [path for mp in models_plus for path in mp.paths]
+    # Use the first non-None vocab, if any.
+    try:
+        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
+    except StopIteration:
+        vocab = None
+
+    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
+        # Transformers models put different tensors in different files, but
+        # don't split indivdual tensors between files.
+        model: LazyModel = {}
+        for mp in models_plus:
+            model.update(mp.model)
+    else:
+        model = merge_sharded([mp.model for mp in models_plus])
+
+    return ModelPlus(model, paths, format, vocab)
+
+
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute(n_head, n_kv_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
+
+def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute_part(n_part, n_head)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+
+def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().part(n_part)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
+
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
+    out: LazyModel = {}
+    out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
+    out["norm.weight"] = model["model.norm.weight"]
+    out["output.weight"] = model["lm_head.weight"]
+
+    for i in itertools.count():
+        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
+            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
+            out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+        else:
+            break
+
+        out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
+
+        out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
+        out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
+        out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
+
+        out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
+        out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
+    return out
+
+
+def handle_quantization(model: LazyModel) -> LazyModel:
+    '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
+    (which resolve to UnquantizedTensors with the raw data) to one with entries
+    for 'foo.weight' (which resolve to QuantizedTensors).
+    '''
+    def convert(name: str) -> Tuple[str, LazyTensor]:
+        if name.endswith(".qweight"):
+            namebase = name.rsplit('.', 1)[0]
+            orig_name = namebase + ".weight"
+
+            lazy_tensor = model[name]
+            assert len(lazy_tensor.shape) == 2
+            real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
+
+            # Calculate type.  This replicates the logic in
+            # GPTQForLLaMaQuantizedTensor (which is executed when the modelis
+            # actually loaded).
+            lazy_scales = model[f"{namebase}.scales"]
+            scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
+            assert real_shape[1] % scales_width == 0
+            groupsize = real_shape[1] // scales_width
+            have_g_idx = f"{namebase}.g_idx" in model
+            data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
+
+            def load() -> Tensor:
+                return GPTQForLLaMaQuantizedTensor(model, namebase)
+
+            return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
+        else:
+            return (name, model[name])
+    return dict(convert(name) for name in model)
+
+# Functionality that simulates `torch.load` but where individual tensors are
+# only loaded into memory on demand, not all at once.
+# PyTorch can't do this natively as of time of writing:
+# - https://github.com/pytorch/pytorch/issues/64327
+# This allows us to de-shard without multiplying RAM usage, and also
+# conveniently drops the PyTorch dependency (though we still need numpy).
+
+
+@dataclass
+class LazyStorageKind:
+    data_type: DataType
+
+
+@dataclass
+class LazyStorage:
+    load: Callable[[int, int], NDArray]
+    kind: LazyStorageKind
+    description: str
+
+
+class LazyUnpickler(pickle.Unpickler):
+    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
+        super().__init__(fp)
+        self.data_base_path = data_base_path
+        self.zip_file = zip_file
+
+    def persistent_load(self, pid: Any) -> Any:
+        assert pid[0] == 'storage'
+        assert isinstance(pid[1], LazyStorageKind)
+        data_type = pid[1].data_type
+        filename_stem = pid[2]
+        filename = self.data_base_path + '/' + filename_stem
+        info = self.zip_file.getinfo(filename)
+
+        def load(offset: int, elm_count: int) -> NDArray:
+            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
+            if dtype is None:
+                raise Exception("tensor stored in unsupported format")
+            fp = self.zip_file.open(info)
+            fp.seek(offset * dtype.itemsize)
+            size = elm_count * dtype.itemsize
+            data = fp.read(size)
+            assert len(data) == size
+            return np.frombuffer(data, dtype)
+        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
+        return LazyStorage(load=load, kind=pid[1], description=description)
+
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
+                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
+        assert isinstance(storage, LazyStorage)
+
+        def load() -> UnquantizedTensor:
+            elm_count = stride[0] * size[0]
+            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
+        description = f'pickled storage_offset={storage_offset} in {storage.description}'
+        return LazyTensor(load, list(size), storage.kind.data_type, description)
+
+    # @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        return func(*args)
+
+    CLASSES: Dict[Any, Any] = {
+        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
+        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
+    }
+
+    def find_class(self, module: str, name: str) -> Any:
+        if not module.startswith('torch'):
+            return super().find_class(module, name)
+        return self.CLASSES[(module, name)]
+
+
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
+    zf = zipfile.ZipFile(outer_fp)
+    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
+    assert len(pickle_paths) == 1, pickle_paths
+    pickle_fp = zf.open(pickle_paths[0], 'r')
+    unpickler = LazyUnpickler(pickle_fp,
+                              data_base_path=pickle_paths[0][:-4],
+                              zip_file=zf)
+    model = unpickler.load()
+    as_dict = dict(model.items())
+    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
+
+
+SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    'BF16': DT_BF16,
+    'F16': DT_F16,
+    'F32': DT_F32,
+    'I32': DT_I32,
+}
+
+
+def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    header_size, = struct.unpack('<Q', fp.read(8))
+    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    byte_buf = mapped[8 + header_size:]
+
+    def convert(info: Dict[str, Any]) -> LazyTensor:
+        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
+        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+        shape: List[int] = info['shape']
+        begin, end = info['data_offsets']
+        assert 0 <= begin <= end <= len(byte_buf)
+        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
+        buf = byte_buf[begin:end]
+
+        def load() -> UnquantizedTensor:
+            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
+        return LazyTensor(load, shape, data_type, description)
+    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
+    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
+
+
+def must_read(fp: IO[bytes], length: int) -> bytes:
+    ret = fp.read(length)
+    if len(ret) < length:
+        raise Exception("unexpectedly reached end of file")
+    return ret
+
+
+def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
+    magic = must_read(fp, 4)[::-1]
+    if magic in (b'ggmf', b'ggjt'):
+        version, = struct.unpack("i", must_read(fp, 4))
+        assert version == 1
+    else:
+        assert magic == b'ggml'
+        version = None
+    n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
+
+    tokens: List[Tuple[bytes, float]] = []
+    for i in range(n_vocab):
+        if i == 32000:
+            # HACK: GPT4All messed with the format without changing the magic
+            # number.  Specifically, they changed the vocab section to contain
+            # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
+            # extra pad token).  Try to detect if we're reading a file like
+            # this.
+            orig_pos = fp.tell()
+            fp.seek(20, io.SEEK_CUR)
+            is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
+            fp.seek(orig_pos)
+            if is_gpt4all:
+                break
+
+        length, = struct.unpack("i", must_read(fp, 4))
+        text = must_read(fp, length)
+        if magic != b'ggml':
+            score, = struct.unpack("f", must_read(fp, 4))
+            tokens.append((text, score))
+    vocab = GGMLVocab(tokens) if magic != b'ggml' else None
+
+    model: LazyModel = {}
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    off = fp.raw.tell()
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    fp.raw.seek(off)  # needed on Windows
+
+    def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
+        shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
+        assert 0 <= shape_len <= 3
+        shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
+        shape = shape[::-1]
+        name = must_read(fp, name_len).decode('utf-8')
+        data_type = FTYPE_TO_DATA_TYPE[ftype]
+
+        if magic == b'ggjt':
+            fp.seek((fp.tell() + 31) & -32)
+
+        if data_type == DT_Q4_1:
+            # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
+            size = 24 * (shape[1] // 32) * shape[0]
+        elif data_type == DT_Q4_0:
+            size = 20 * (shape[1] // 32) * shape[0]
+        else:
+            numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+            elm_count = math.prod(shape)
+            size = elm_count * numpy_dtype.itemsize
+        offset = fp.tell()
+        buf = mapped[offset:offset+size]
+        fp.seek(size, io.SEEK_CUR)
+
+        def load() -> Tensor:
+            if isinstance(data_type, QuantizedDataType):
+                ndarray = np.frombuffer(buf, dtype=np.uint32)
+                return GGMLQuantizedTensor(ndarray, shape, data_type)
+            else:
+                return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        description = f'ggml offset={offset} type={data_type} path={path}'
+        model[name] = LazyTensor(load, shape, data_type, description)
+
+    while fp.read(1) != b'':
+        fp.seek(-1, io.SEEK_CUR)
+        read_tensor()
+
+    return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
+
+
+@functools.lru_cache(maxsize=None)
+def lazy_load_file(path: Path) -> ModelPlus:
+    fp = open(path, 'rb')
+    first8 = fp.read(8)
+    fp.seek(0)
+    if first8[:2] == b'PK':
+        # A zip file, i.e. PyTorch format
+        return lazy_load_torch_file(fp, path)
+    elif first8[2:4] == b'gg':
+        # GGML format
+        return lazy_load_ggml_file(fp, path)
+    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
+        # Probably safetensors
+        return lazy_load_safetensors_file(fp, path)
+    else:
+        raise ValueError(f"unknown format: {path}")
+
+
+In = TypeVar('In')
+Out = TypeVar('Out')
+
+
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
+    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
+    fast enough, this will stop calling `func` at some point rather than
+    letting results pile up in memory.  Specifically, there is a max of one
+    output value buffered per thread.'''
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures: List[concurrent.futures.Future[Out]] = []
+        items_rev = list(iterable)[::-1]
+        for i in range(min(concurrency, len(items_rev))):
+            futures.append(executor.submit(func, items_rev.pop()))
+        while futures:
+            result = futures.pop(0).result()
+            if items_rev:
+                futures.append(executor.submit(func, items_rev.pop()))
+            yield result
+
+
+def check_vocab_size(params: Params, vocab: Vocab) -> None:
+    if params.n_vocab != vocab.vocab_size:
+        # GGMLVocab comes from the same file as the model so shouldn't mismatch:
+        assert isinstance(vocab, SentencePieceVocab)
+        if params.n_vocab == vocab.vocab_size_base:
+            print("Ignoring added_tokens.json since model matches vocab size without it.")
+            vocab.added_tokens_list = []
+            vocab.vocab_size = vocab.vocab_size_base
+            return
+        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
+        if vocab.fname_added_tokens is not None:
+            msg += f" combined with {vocab.fname_added_tokens}"
+        msg += f" has {vocab.vocab_size})."
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        raise Exception(msg)
+
+
+class OutputFile:
+    def __init__(self, fname_out: Path) -> None:
+        self.fout = open(fname_out, "wb")
+
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
+        self.fout.write(b"ggjt"[::-1])  # magic
+        values = [
+            1,  # file version
+            params.n_vocab,
+            params.n_embd,
+            params.n_mult,
+            params.n_head,
+            params.n_layer,
+            params.n_embd // params.n_head,  # rot (obsolete)
+            file_type.value,
+        ]
+        self.fout.write(struct.pack("i" * len(values), *values))
+
+    def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
+        sname = name.encode('utf-8')
+        self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
+        self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
+        self.fout.write(sname)
+        self.fout.seek((self.fout.tell() + 31) & -32)
+
+    def write_vocab(self, vocab: Vocab) -> None:
+        for text, score in vocab.all_tokens():
+            self.fout.write(struct.pack("i", len(text)))
+            self.fout.write(text)
+            self.fout.write(struct.pack("f", score))
+
+    @staticmethod
+    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
+        of = OutputFile(fname_out)
+        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
+        of = OutputFile(fname_out)
+        of.write_file_header(params, file_type=GGMLFileType.AllF32)
+        of.write_vocab(vocab)
+        of.fout.close()
+
+    @staticmethod
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
+        check_vocab_size(params, vocab)
+        of = OutputFile(fname_out)
+        of.write_file_header(params, file_type)
+        print("Writing vocab...")
+        of.write_vocab(vocab)
+
+        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
+            name, lazy_tensor = item
+            return lazy_tensor.load().to_ggml().ndarray
+
+        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=1)
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
+            of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
+            ndarray.tofile(of.fout)
+        of.fout.close()
+
+
+def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
+    wq_type = model["layers.0.attention.wq.weight"].data_type
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
+        return GGMLFileType.AllF32
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
+        return GGMLFileType.MostlyF16
+    if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
+                                     wq_type.have_addends):
+        if isinstance(model["output.weight"].data_type, QuantizedDataType):
+            return GGMLFileType.MostlyQ4_1
+        else:
+            return GGMLFileType.PerLayerIsQ4_1
+    if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
+        return GGMLFileType.MostlyQ4_0
+    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+    raise Exception(f"Unexpected combination of types: {name_to_type}")
+
+
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
+    model = handle_quantization(model)
+
+    if "lm_head.weight" in model:
+        model = convert_transformers_to_orig(model, params)
+    model = filter_and_sort_tensors(model)
+
+    return model
+
+
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
+
+
+def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
+    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the nth path in the model.
+    '''
+    # Support the following patterns:
+    patterns: List[Tuple[str, str]] = [
+        # - x.00.pth, x.01.pth, etc.
+        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
+        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
+        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
+        # x.bin, x.bin.1, etc.
+        (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
+    ]
+    for regex, replacement in patterns:
+        if re.search(regex, path.name):
+            new_path = path.with_name(re.sub(regex, replacement, path.name))
+            if new_path.exists():
+                return new_path
+    return None
+
+
+def find_multifile_paths(path: Path) -> List[Path]:
+    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the whole list of paths in the model.
+    '''
+    ret: List[Path] = []
+    for i in itertools.count():
+        nth_path = nth_multifile_path(path, i)
+        if nth_path is None:
+            break
+        ret.append(nth_path)
+    if not ret:
+        # No matches.  This should only happen if the file was named, e.g.,
+        # foo.0, and there was no file named foo.  Oh well, try to process it
+        # as a single file.
+        return [path]
+    return ret
+
+
+def load_some_model(path: Path) -> ModelPlus:
+    '''Load a model of any supported format.'''
+    # Be extra-friendly and accept either a file or a directory:
+    if path.is_dir():
+        # Check if it's a set of safetensors files first
+        files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files:
+            # Try the PyTorch patterns too, with lower priority
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
+            files = [file for glob in globs for file in path.glob(glob)]
+        if not files:
+            # Try GGML too, but with lower priority, since if both a non-GGML
+            # model and a GGML model exist in the same directory, we assume the
+            # latter was converted from the former.
+            files = list(path.glob("ggml-model*.bin*"))
+        if not files:
+            raise Exception(f"Can't find model in directory {path}")
+        if len(files) > 1:
+            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+        path = files[0]
+
+    paths = find_multifile_paths(path)
+    models_plus: List[ModelPlus] = []
+    for path in paths:
+        print(f"Loading model file {path}")
+        models_plus.append(lazy_load_file(path))
+
+    model_plus = merge_multifile_models(models_plus)
+    return model_plus
+
+
+def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
+    return {name: model[name] for name in TENSORS_LIST if name in model}
+
+
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+    print(f"vocabtype: {vocabtype}")
+    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    # a directory, it might be the model directory, and tokenizer.model might
+    # be in the parent of that.
+    if path.is_dir():
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+          vocab_file = "vocab.json"
+        path2 = path / vocab_file
+        # Use `.parent` instead of /.. to handle the symlink case better.
+        path3 = path.parent / vocab_file
+        if path2.exists():
+            path = path2
+        elif path3.exists():
+            path = path3
+        else:
+            raise FileNotFoundError(
+                f"Could not find tokenizer.model in {path} or its parent; "
+                "if it's in another directory, pass the directory as --vocab-dir")
+    added_tokens_path = path.parent / "added_tokens.json"
+    print(f"Loading vocab file {path}")
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+                              vocabtype)
+
+
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
+    namestr = {
+        GGMLFileType.AllF32: "f32",
+        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ4_0: "q4_0",
+        GGMLFileType.MostlyQ4_1: "q4_1",
+        GGMLFileType.PerLayerIsQ4_1: "q4_1",
+    }[file_type]
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
+    if ret in model_paths:
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
+        sys.exit(1)
+    return ret
+
+
+def do_dump_model(model_plus: ModelPlus) -> None:
+    print(f"model_plus.paths = {model_plus.paths!r}")
+    print(f"model_plus.format = {model_plus.format!r}")
+    print(f"model_plus.vocab = {model_plus.vocab!r}")
+    for name, lazy_tensor in model_plus.model.items():
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
+
+
+def main(args_in: Optional[List[str]] = None) -> None:
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+    parser.add_argument("model", type=Path,
+                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
+    args = parser.parse_args(args_in)
+
+    vocab: Vocab
+    if args.dump_single:
+        model_plus = lazy_load_file(args.model)
+        do_dump_model(model_plus)
+    elif args.vocab_only:
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
+        assert args.outfile, "need --outfile if using --vocab-only"
+        outfile = args.outfile
+        OutputFile.write_vocab_only(outfile, vocab)
+        print(f"Wrote {outfile}")
+    else:
+        model_plus = load_some_model(args.model)
+        if args.dump:
+            do_dump_model(model_plus)
+            return
+        if model_plus.vocab is not None and args.vocab_dir is None:
+            vocab = model_plus.vocab
+        else:
+            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
+            vocab = load_vocab(vocab_dir, args.vocabtype)
+        params = Params.load(model_plus)
+        model = model_plus.model
+        model = do_necessary_conversions(model, params)
+        output_type = pick_output_type(model, args.outtype)
+        model = convert_to_output_type(model, output_type)
+        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+        OutputFile.write_all(outfile, params, output_type, model, vocab)
+        print(f"Wrote {outfile}")
+
+
+if __name__ == '__main__':
+    main()

From 7de7cb4bd86fb2d48391246ab9646a05fc05e556 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 04:06:59 +0200
Subject: [PATCH 09/25] convert-permute-debug.py : change permute type of
 attn_q

---
 convert-permute-debug.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert-permute-debug.py b/convert-permute-debug.py
index 14927e8e4..fcbb5f2da 100644
--- a/convert-permute-debug.py
+++ b/convert-permute-debug.py
@@ -819,12 +819,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
             tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
         else:

From d5c8fcfd8ab2db5405c615cae2c74334e03afb65 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 04:33:33 +0200
Subject: [PATCH 10/25] convert.py : 70b model working (change attn_q permute)

---
 convert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/convert.py b/convert.py
index f6237579d..df589928b 100755
--- a/convert.py
+++ b/convert.py
@@ -326,6 +326,7 @@ Vocab = Union[BpeVocab, SentencePieceVocab]
 #
 
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head //= n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -818,12 +819,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head_kv)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
             tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
         else:

From 287db5101539f1eb87b3ae8218ef1a5cceade991 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 04:34:39 +0200
Subject: [PATCH 11/25] Delete convert-permute-debug-master.py

---
 convert-permute-debug-master.py | 1327 -------------------------------
 1 file changed, 1327 deletions(-)
 delete mode 100644 convert-permute-debug-master.py

diff --git a/convert-permute-debug-master.py b/convert-permute-debug-master.py
deleted file mode 100644
index 7d64b2252..000000000
--- a/convert-permute-debug-master.py
+++ /dev/null
@@ -1,1327 +0,0 @@
-#!/usr/bin/env python
-import argparse
-import concurrent.futures
-import copy
-import enum
-import faulthandler
-import functools
-import io
-import itertools
-import json
-import math
-import mmap
-import pickle
-import re
-import signal
-import struct
-import sys
-import zipfile
-from abc import ABCMeta, abstractmethod
-from dataclasses import dataclass
-from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
-                    Literal, Optional, Sequence, Tuple, TypeVar, Union)
-
-import numpy as np
-from sentencepiece import SentencePieceProcessor  # type: ignore
-
-if TYPE_CHECKING:
-    from typing_extensions import TypeAlias
-
-if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
-    faulthandler.register(signal.SIGUSR1)
-
-NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
-
-
-@dataclass(frozen=True)
-class UnquantizedDataType:
-    name: str
-
-
-DT_F16 = UnquantizedDataType('F16')
-DT_F32 = UnquantizedDataType('F32')
-DT_I32 = UnquantizedDataType('I32')
-DT_BF16 = UnquantizedDataType('BF16')
-
-
-@dataclass(frozen=True)
-class QuantizedDataType:
-    groupsize: int
-    have_addends: bool
-    have_g_idx: bool
-
-
-DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
-DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
-
-DataType = Union[UnquantizedDataType, QuantizedDataType]
-
-DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
-    DT_F32: 0,
-    DT_F16: 1,
-    DT_Q4_0: 2,
-    DT_Q4_1: 3,
-}
-
-FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
-    {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
-
-DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
-    DT_BF16: np.dtype(np.uint16),
-    DT_F16: np.dtype(np.float16),
-    DT_F32: np.dtype(np.float32),
-    DT_I32: np.dtype(np.int32),
-}
-
-NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
-    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
-
-
-class GGMLFileType(enum.Enum):
-    AllF32 = 0
-    MostlyF16 = 1  # except 1d tensors
-    MostlyQ4_0 = 2  # except 1d tensors
-    MostlyQ4_1 = 3  # except 1d tensors
-    PerLayerIsQ4_1 = 4  # but tok_embeddings.weight and output.weight are F16
-
-    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
-        if len(tensor.shape) == 1:
-            # 1D tensors are always F32.
-            return DT_F32
-        elif self == GGMLFileType.AllF32:
-            return DT_F32
-        elif self == GGMLFileType.MostlyF16:
-            return DT_F16
-        elif self == GGMLFileType.MostlyQ4_0:
-            return DT_Q4_0
-        elif self == GGMLFileType.MostlyQ4_1:
-            return DT_Q4_1
-        elif self == GGMLFileType.PerLayerIsQ4_1:
-            if name in ('output.weight', 'tok_embeddings.weight'):
-                return DT_F16
-            else:
-                return DT_Q4_1
-        else:
-            raise ValueError(self)
-
-
-def make_tensors_list() -> List[str]:
-    ret = [
-        'tok_embeddings.weight',
-        'norm.weight',
-        'output.weight',
-    ]
-    for i in range(80):  # maximum number of layer
-        ret += [
-            f'layers.{i}.attention.wq.weight',
-            f'layers.{i}.attention.wk.weight',
-            f'layers.{i}.attention.wv.weight',
-            f'layers.{i}.attention.wo.weight',
-            f'layers.{i}.attention_norm.weight',
-            f'layers.{i}.feed_forward.w1.weight',
-            f'layers.{i}.feed_forward.w2.weight',
-            f'layers.{i}.feed_forward.w3.weight',
-            f'layers.{i}.ffn_norm.weight',
-        ]
-    return ret
-
-
-TENSORS_LIST = make_tensors_list()
-TENSORS_SET = set(TENSORS_LIST)
-
-
-def find_n_mult(n_ff: int, n_embd: int) -> int:
-    # hardcoded magic range
-    for n_mult in range(8192, 1, -1):
-        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
-        if calc_ff == n_ff:
-            return n_mult
-    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
-
-@dataclass
-class Params:
-    n_vocab:   int
-    n_embd:    int
-    n_mult:    int
-    n_head:    int
-    n_layer:   int
-    n_kv_head: Optional[int]  # This parameter is only used for Llama 2
-
-    @staticmethod
-    def guessed(model: 'LazyModel') -> 'Params':
-        # try transformer naming first
-        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
-
-        # try transformer naming first
-        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
-        else:
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
-
-        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
-
-        n_head=n_embd // 128 # guessed
-
-        return Params(
-            n_vocab   = n_vocab,
-            n_embd    = n_embd,
-            n_mult    = 256,
-            n_head    = n_head,
-            n_layer   = n_layer,
-            n_kv_head = None,
-        )
-
-    @staticmethod
-    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
-        config = json.load(open(config_path))
-
-        n_vocab = config["vocab_size"];
-        n_embd  = config["hidden_size"];
-        n_head  = config["num_attention_heads"];
-        n_layer = config["num_hidden_layers"];
-        n_ff    = config["intermediate_size"];
-        n_kv_head = config.get("num_key_value_heads")
-
-        n_mult = find_n_mult(n_ff, n_embd);
-
-        return Params(
-            n_vocab   = n_vocab,
-            n_embd    = n_embd,
-            n_mult    = n_mult,
-            n_head    = n_head,
-            n_layer   = n_layer,
-            n_kv_head = n_kv_head,
-        )
-
-    # LLaMA v2 70B params.json
-    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
-    @staticmethod
-    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
-        config = json.load(open(config_path))
-
-        n_vocab   = config["vocab_size"];
-        n_embd    = config["dim"];
-        n_head    = config["n_heads"];
-        n_layer   = config["n_layers"];
-        n_mult    = config["multiple_of"];
-
-        if n_vocab == -1:
-            n_vocab = model["tok_embeddings.weight"].shape[0]
-
-        return Params(
-            n_vocab   = n_vocab,
-            n_embd    = n_embd,
-            n_mult    = n_mult,
-            n_head    = n_head,
-            n_layer   = n_layer,
-            n_kv_head = None,
-        )
-
-    @staticmethod
-    def load(model_plus: 'ModelPlus') -> 'Params':
-        hf_config_path   = model_plus.paths[0].parent / "config.json"
-        orig_config_path = model_plus.paths[0].parent / "params.json"
-
-        if hf_config_path.exists():
-            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
-        elif orig_config_path.exists():
-            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        else:
-            params = Params.guessed(model_plus.model)
-
-        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
-        return params
-
-
-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
-        self.vocabtype = vocabtype
-        if self.vocabtype == "bpe":
-          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
-        else:
-          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: Dict[str, int]
-        if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
-        else:
-            added_tokens = {}
-        if self.vocabtype == "bpe":
-          vocab_size: int = len(self.sentencepiece_tokenizer)
-        else:
-          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
-
-    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        tokenizer = self.sentencepiece_tokenizer
-        if self.vocabtype == "bpe":
-          from transformers.models.gpt2 import tokenization_gpt2
-          byte_encoder = tokenization_gpt2.bytes_to_unicode()
-          byte_decoder = {v: k for k, v in byte_encoder.items()}
-          for i, item in enumerate(tokenizer):
-            text: bytes
-            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-            score: float = -i
-            yield text, score
-        else:
-          for i in range(tokenizer.vocab_size()):
-              text: bytes
-              if tokenizer.is_unknown(i):
-                  text = " \u2047 ".encode("utf-8")
-              elif tokenizer.is_control(i):
-                  text = b""
-              elif tokenizer.is_byte(i):
-                  piece = tokenizer.id_to_piece(i)
-                  if len(piece) != 6:
-                      raise Exception(f"Invalid token: {piece}")
-                  byte_value = int(piece[3:-1], 16)
-                  text = struct.pack("B", byte_value)
-              else:
-                  text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-              score: float = tokenizer.get_score(i)
-              yield text, score
-
-    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score
-
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class GGMLVocab:
-    def __init__(self, tokens: List[Tuple[bytes, float]]):
-        self.tokens = tokens
-        self.vocab_size = len(tokens)
-
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        return self.tokens
-
-    def __repr__(self) -> str:
-        return f"<GGMLVocab with {self.vocab_size} tokens>"
-
-
-Vocab = Union[SentencePieceVocab, GGMLVocab]
-
-
-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
-    print( "permute " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
-    if n_kv_head is not None and n_head != n_kv_head:
-        n_head //= n_kv_head
-    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
-
-def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
-    # First reinterpret each row from a list of int32s containing 8 values each
-    # to a list of uint8s containing 2 values each.
-    qvalues_pack8 = qvalues_pack32.view(np.uint8)
-
-    # Then split out the two values per int8 (which requires an actual
-    # conversion because numpy doesn't natively support int4s).
-    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
-    qvalues[:, 0::2] = qvalues_pack8 & 0xf
-    qvalues[:, 1::2] = qvalues_pack8 >> 4
-
-    assert addends is None or addends.shape == scales.shape
-    assert qvalues.shape[0] == scales.shape[0]
-    assert qvalues.shape[1] % scales.shape[1] == 0
-    if g_idx is None:
-        repeat_count = qvalues.shape[1] // scales.shape[1]
-        scales = scales[:, :, np.newaxis]
-        if addends is not None:
-            addends = addends[:, :, np.newaxis]
-        # Reshape so that the below computation broadcasts over scales and addends:
-        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
-    else:
-        # In this case the scale and addend is selected for each column by g_idx:
-        assert addends is not None
-        scales = scales[:, g_idx]
-        addends = addends[:, g_idx]
-    if addends is None:
-        # Q4_0
-        qvalues = qvalues.view(np.int8)
-        qvalues -= 8
-    # And do the actual 'value = scale * qvalue + addend' computation.
-    values = scales * qvalues
-    if addends is not None:
-        values += addends
-    if g_idx is None:
-        values.shape = (values.shape[0], values.shape[1] * values.shape[2])
-    return values
-
-
-class Tensor(metaclass=ABCMeta):
-    data_type: DataType
-
-    @abstractmethod
-    def astype(self, data_type: DataType) -> 'Tensor': ...
-    @abstractmethod
-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
-    @abstractmethod
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
-    def part(self, n_part: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
-    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
-
-
-def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
-    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
-    fp32_arr = bf16_arr.astype(np.uint32) << 16
-    return fp32_arr.view(np.float32)
-
-
-class UnquantizedTensor(Tensor):
-    def __init__(self, ndarray: NDArray) -> None:
-        assert isinstance(ndarray, np.ndarray)
-        self.ndarray = ndarray
-        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
-
-    def astype(self, data_type: DataType) -> Tensor:
-        dtype = DATA_TYPE_TO_NUMPY[data_type]
-        if self.data_type == DT_BF16:
-            self.ndarray = bf16_to_fp32(self.ndarray)
-        return UnquantizedTensor(self.ndarray.astype(dtype))
-
-    def to_ggml(self) -> 'UnquantizedTensor':
-        return self
-
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
-
-    def part(self, n_part: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
-
-
-def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
-    tensor = lazy_tensor.load()
-    assert isinstance(tensor, UnquantizedTensor)
-
-    # double-check:
-    actual_shape = list(tensor.ndarray.shape)
-    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
-    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
-        if convert:
-            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
-        else:
-            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
-
-    return tensor.ndarray
-
-
-class GGMLQuantizedTensor(Tensor):
-    data_type: QuantizedDataType
-
-    def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
-        rows, columns = shape
-        assert data_type in (DT_Q4_1, DT_Q4_0)  # for now
-        assert isinstance(data_type, QuantizedDataType)  # redundant, but mypy complains without this
-        assert columns % data_type.groupsize == 0
-        words_in_block = 6 if data_type == DT_Q4_1 else 5
-        self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
-        self.shape = shape[:]
-        self.data_type = data_type
-
-    def astype(self, data_type: DataType) -> Tensor:
-        if data_type == self.data_type:
-            return self
-        scales = self.ndarray[:, :, 0].view(np.float32)
-        if self.data_type.have_addends:
-            addends = self.ndarray[:, :, 1].view(np.float32)
-        else:
-            addends = None
-        qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
-
-        dq = dequantize_q4(qweights, scales, addends, g_idx=None)
-        return UnquantizedTensor(dq).astype(data_type)
-
-    def to_ggml(self) -> 'GGMLQuantizedTensor':
-        return self
-
-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
-        return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
-
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
-
-    def part(self, n_part: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
-GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
-
-
-class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
-        self.base = base
-        self.n_head = n_head
-        self.n_kv_head = n_kv_head
-        self.data_type = self.base.data_type
-
-    def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
-
-    def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
-
-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
-        raise Exception("shouldn't permute twice")
-
-
-class GPTQForLLaMaQuantizedTensor(Tensor):
-    def __init__(self, model: 'LazyModel', namebase: str) -> None:
-        qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
-        scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
-
-        bias = model.get(f"{namebase}.bias")
-        if bias is not None:
-            # Q4_1 does not support bias; good thing the bias is always all zeros.
-            assert not np.any(load_unquantized(bias))
-
-        if f"{namebase}.zeros" in model:
-            zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
-        else:
-            qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
-            assert qzeros.dtype == np.int32
-            zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
-            assert zeros.dtype == np.float32
-
-        assert zeros.shape == scales.shape
-
-        # Output is transposed compared to the input, and addends have their sign flipped.
-        # Scales and zeros similarly must be transposed but only for newer
-        # versions of GPTQ-for-LLaMa; the older versions can be identified by
-        # having shape (n_embd, 1).
-        qweight = qweight.T
-        if scales.shape[1] != 1:
-            scales = scales.T
-            zeros = zeros.T
-
-        # Output also has signs flipped for the addends.
-        self.qweight = qweight
-        self.scales = scales
-        self.addends = -zeros
-
-        self.g_idx: Optional[NDArray]
-        if f"{namebase}.g_idx" in model:
-            self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
-            assert self.g_idx.shape == (qweight.shape[1] * 8,)
-        else:
-            self.g_idx = None
-
-        self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
-        self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
-                                           have_g_idx=(self.g_idx is not None))
-
-    def inspect(self, row: int, col: int) -> None:
-        '''For debugging.'''
-        qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
-        if self.g_idx is not None:
-            group = self.g_idx[col]
-        else:
-            group = int(col // self.groupsize())
-        scale = self.scales[row, group]
-        addend = self.addends[row, group]
-        with np.printoptions(precision=None, suppress=True):
-            print(f'scale:{scale} addend:{addend} qweight:{qweight}')
-            print('possible values:', np.arange(16) * scale + addend)
-            print('actual value:', qweight * scale + addend)
-
-    def astype(self, data_type: DataType) -> Tensor:
-        if isinstance(data_type, QuantizedDataType):
-            assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
-            return self.regroup(data_type.groupsize)
-
-        dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
-        return UnquantizedTensor(dequantized).astype(data_type)
-
-    def groupsize(self) -> int:
-        assert self.addends.shape == self.scales.shape
-        assert self.shape[1] % self.scales.shape[1] == 0
-        return self.shape[1] // self.scales.shape[1]
-
-    def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
-        # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
-        # columns in a row.  Newer versions share them between every set of N
-        # columns in a row, where N is the `groupsize` parameter, usually 128.  The
-        # output format shares them between every set of 32 columns.  To handle
-        # this, duplicate scales and addends for every smaller group.
-        # (In the above, 'row' and 'column' are in the sense of the output.)
-        assert self.g_idx is None
-        old_groupsize = self.groupsize()
-        assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
-        ret = copy.copy(self)
-        ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
-        ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
-        ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
-        return ret
-
-    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
-        return DeferredPermutedTensor(self, n_head, n_kv_head)
-
-    def to_ggml(self) -> GGMLQuantizedTensor:
-        # The output format looks like this:
-        # For each row:
-        #   For each group of 32 columns:
-        #     - addend (float32, 4 bytes)
-        #     - scale (float32, 4 bytes)
-        #     - weights (int4 * 32, 16 bytes)
-
-        if self.groupsize() != 32:
-            raise Exception("should have been regrouped before converting to ggml")
-
-        # Since the output format is mixed between integers and floats, we have
-        # to hackily view the floats as int32s just so numpy will let us
-        # concatenate them.
-        addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
-        scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
-
-        # Split into groups of 4 columns (i.e. 32 columns of quantized data):
-        grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
-
-        # And concatenate:
-        grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
-
-        return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
-
-
-@dataclass
-class LazyTensor:
-    _load: Callable[[], Tensor]
-    shape: List[int]
-    data_type: DataType
-    description: str
-
-    def load(self) -> Tensor:
-        ret = self._load()
-        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
-        return ret
-
-    def astype(self, data_type: DataType) -> 'LazyTensor':
-        self.validate_conversion_to(data_type)
-
-        def load() -> Tensor:
-            return self.load().astype(data_type)
-        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
-
-    def validate_conversion_to(self, data_type: DataType) -> None:
-        if data_type == self.data_type:
-            return
-        if isinstance(data_type, QuantizedDataType):
-            if not isinstance(self.data_type, QuantizedDataType):
-                raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
-            if self.data_type.have_g_idx:
-                sys.stderr.write(
-                    "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
-                    "which is not yet natively supported by GGML. "
-                    "For now you can still convert this model by passing `--outtype f16` to dequantize, "
-                    "but that will result in a much larger output file for no quality benefit.\n")
-                sys.exit(1)
-            assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
-
-
-LazyModel = Dict[str, LazyTensor]
-
-
-@dataclass
-class ModelPlus:
-    model: LazyModel
-    paths: List[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors']
-    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
-
-
-def merge_sharded(models: List[LazyModel]) -> LazyModel:
-    # Original LLaMA models have each file contain one part of each tensor.
-    # Use a dict instead of a set to preserve order.
-    names = {name: None for model in models for name in model}
-
-    def convert(name: str) -> LazyTensor:
-        lazy_tensors: List[LazyTensor] = [model[name] for model in models]
-        if len(lazy_tensors) == 1:
-            # only one file; don't go through this procedure since there might
-            # be quantized tensors
-            return lazy_tensors[0]
-        if len(lazy_tensors[0].shape) == 1:
-            # the tensor is just duplicated in every file
-            return lazy_tensors[0]
-        if name.startswith('tok_embeddings.') or \
-           name.endswith('.attention.wo.weight') or \
-           name.endswith('.feed_forward.w2.weight'):
-            # split by columns
-            axis = 1
-        else:
-            # split by rows
-            axis = 0
-        concatenated_shape = list(lazy_tensors[0].shape)
-        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
-
-        def load() -> UnquantizedTensor:
-            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
-            return UnquantizedTensor(concatenated)
-        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
-        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
-    return {name: convert(name) for name in names}
-
-
-def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
-    assert len(formats) == 1, "different formats?"
-    format = formats.pop()
-    paths = [path for mp in models_plus for path in mp.paths]
-    # Use the first non-None vocab, if any.
-    try:
-        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
-    except StopIteration:
-        vocab = None
-
-    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
-        # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
-        model: LazyModel = {}
-        for mp in models_plus:
-            model.update(mp.model)
-    else:
-        model = merge_sharded([mp.model for mp in models_plus])
-
-    return ModelPlus(model, paths, format, vocab)
-
-
-def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head, n_kv_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
-
-def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().permute_part(n_part, n_head)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
-
-def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().part(n_part)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
-
-def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
-    out: LazyModel = {}
-    out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
-    out["norm.weight"] = model["model.norm.weight"]
-    out["output.weight"] = model["lm_head.weight"]
-
-    for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
-            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
-        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
-            out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
-        else:
-            break
-
-        out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
-
-        out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
-        out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
-        out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
-
-        out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
-        out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
-    return out
-
-
-def handle_quantization(model: LazyModel) -> LazyModel:
-    '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
-    (which resolve to UnquantizedTensors with the raw data) to one with entries
-    for 'foo.weight' (which resolve to QuantizedTensors).
-    '''
-    def convert(name: str) -> Tuple[str, LazyTensor]:
-        if name.endswith(".qweight"):
-            namebase = name.rsplit('.', 1)[0]
-            orig_name = namebase + ".weight"
-
-            lazy_tensor = model[name]
-            assert len(lazy_tensor.shape) == 2
-            real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
-
-            # Calculate type.  This replicates the logic in
-            # GPTQForLLaMaQuantizedTensor (which is executed when the modelis
-            # actually loaded).
-            lazy_scales = model[f"{namebase}.scales"]
-            scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
-            assert real_shape[1] % scales_width == 0
-            groupsize = real_shape[1] // scales_width
-            have_g_idx = f"{namebase}.g_idx" in model
-            data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
-
-            def load() -> Tensor:
-                return GPTQForLLaMaQuantizedTensor(model, namebase)
-
-            return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
-        else:
-            return (name, model[name])
-    return dict(convert(name) for name in model)
-
-# Functionality that simulates `torch.load` but where individual tensors are
-# only loaded into memory on demand, not all at once.
-# PyTorch can't do this natively as of time of writing:
-# - https://github.com/pytorch/pytorch/issues/64327
-# This allows us to de-shard without multiplying RAM usage, and also
-# conveniently drops the PyTorch dependency (though we still need numpy).
-
-
-@dataclass
-class LazyStorageKind:
-    data_type: DataType
-
-
-@dataclass
-class LazyStorage:
-    load: Callable[[int, int], NDArray]
-    kind: LazyStorageKind
-    description: str
-
-
-class LazyUnpickler(pickle.Unpickler):
-    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
-        super().__init__(fp)
-        self.data_base_path = data_base_path
-        self.zip_file = zip_file
-
-    def persistent_load(self, pid: Any) -> Any:
-        assert pid[0] == 'storage'
-        assert isinstance(pid[1], LazyStorageKind)
-        data_type = pid[1].data_type
-        filename_stem = pid[2]
-        filename = self.data_base_path + '/' + filename_stem
-        info = self.zip_file.getinfo(filename)
-
-        def load(offset: int, elm_count: int) -> NDArray:
-            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
-            if dtype is None:
-                raise Exception("tensor stored in unsupported format")
-            fp = self.zip_file.open(info)
-            fp.seek(offset * dtype.itemsize)
-            size = elm_count * dtype.itemsize
-            data = fp.read(size)
-            assert len(data) == size
-            return np.frombuffer(data, dtype)
-        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
-        return LazyStorage(load=load, kind=pid[1], description=description)
-
-    # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                               # pyright: ignore[reportSelfClsParameterName]
-                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
-        assert isinstance(storage, LazyStorage)
-
-        def load() -> UnquantizedTensor:
-            elm_count = stride[0] * size[0]
-            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
-        description = f'pickled storage_offset={storage_offset} in {storage.description}'
-        return LazyTensor(load, list(size), storage.kind.data_type, description)
-
-    # @staticmethod
-    def rebuild_from_type_v2(func, new_type, args, state):
-        return func(*args)
-
-    CLASSES: Dict[Any, Any] = {
-        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
-        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
-        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
-        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
-        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
-        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
-        ('torch', 'Tensor'): LazyTensor,
-    }
-
-    def find_class(self, module: str, name: str) -> Any:
-        if not module.startswith('torch'):
-            return super().find_class(module, name)
-        return self.CLASSES[(module, name)]
-
-
-def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
-    zf = zipfile.ZipFile(outer_fp)
-    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
-    assert len(pickle_paths) == 1, pickle_paths
-    pickle_fp = zf.open(pickle_paths[0], 'r')
-    unpickler = LazyUnpickler(pickle_fp,
-                              data_base_path=pickle_paths[0][:-4],
-                              zip_file=zf)
-    model = unpickler.load()
-    as_dict = dict(model.items())
-    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
-
-
-SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
-    'BF16': DT_BF16,
-    'F16': DT_F16,
-    'F32': DT_F32,
-    'I32': DT_I32,
-}
-
-
-def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
-    header_size, = struct.unpack('<Q', fp.read(8))
-    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
-    # Use mmap for the actual data to avoid race conditions with the file offset.
-    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    byte_buf = mapped[8 + header_size:]
-
-    def convert(info: Dict[str, Any]) -> LazyTensor:
-        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
-        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
-        shape: List[int] = info['shape']
-        begin, end = info['data_offsets']
-        assert 0 <= begin <= end <= len(byte_buf)
-        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
-        buf = byte_buf[begin:end]
-
-        def load() -> UnquantizedTensor:
-            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
-        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
-        return LazyTensor(load, shape, data_type, description)
-    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
-    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
-
-
-def must_read(fp: IO[bytes], length: int) -> bytes:
-    ret = fp.read(length)
-    if len(ret) < length:
-        raise Exception("unexpectedly reached end of file")
-    return ret
-
-
-def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
-    magic = must_read(fp, 4)[::-1]
-    if magic in (b'ggmf', b'ggjt'):
-        version, = struct.unpack("i", must_read(fp, 4))
-        assert version == 1
-    else:
-        assert magic == b'ggml'
-        version = None
-    n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
-
-    tokens: List[Tuple[bytes, float]] = []
-    for i in range(n_vocab):
-        if i == 32000:
-            # HACK: GPT4All messed with the format without changing the magic
-            # number.  Specifically, they changed the vocab section to contain
-            # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
-            # extra pad token).  Try to detect if we're reading a file like
-            # this.
-            orig_pos = fp.tell()
-            fp.seek(20, io.SEEK_CUR)
-            is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
-            fp.seek(orig_pos)
-            if is_gpt4all:
-                break
-
-        length, = struct.unpack("i", must_read(fp, 4))
-        text = must_read(fp, length)
-        if magic != b'ggml':
-            score, = struct.unpack("f", must_read(fp, 4))
-            tokens.append((text, score))
-    vocab = GGMLVocab(tokens) if magic != b'ggml' else None
-
-    model: LazyModel = {}
-    # Use mmap for the actual data to avoid race conditions with the file offset.
-    off = fp.raw.tell()
-    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off)  # needed on Windows
-
-    def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
-        shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
-        assert 0 <= shape_len <= 3
-        shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
-        shape = shape[::-1]
-        name = must_read(fp, name_len).decode('utf-8')
-        data_type = FTYPE_TO_DATA_TYPE[ftype]
-
-        if magic == b'ggjt':
-            fp.seek((fp.tell() + 31) & -32)
-
-        if data_type == DT_Q4_1:
-            # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
-            size = 24 * (shape[1] // 32) * shape[0]
-        elif data_type == DT_Q4_0:
-            size = 20 * (shape[1] // 32) * shape[0]
-        else:
-            numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
-            elm_count = math.prod(shape)
-            size = elm_count * numpy_dtype.itemsize
-        offset = fp.tell()
-        buf = mapped[offset:offset+size]
-        fp.seek(size, io.SEEK_CUR)
-
-        def load() -> Tensor:
-            if isinstance(data_type, QuantizedDataType):
-                ndarray = np.frombuffer(buf, dtype=np.uint32)
-                return GGMLQuantizedTensor(ndarray, shape, data_type)
-            else:
-                return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
-        description = f'ggml offset={offset} type={data_type} path={path}'
-        model[name] = LazyTensor(load, shape, data_type, description)
-
-    while fp.read(1) != b'':
-        fp.seek(-1, io.SEEK_CUR)
-        read_tensor()
-
-    return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
-
-
-@functools.lru_cache(maxsize=None)
-def lazy_load_file(path: Path) -> ModelPlus:
-    fp = open(path, 'rb')
-    first8 = fp.read(8)
-    fp.seek(0)
-    if first8[:2] == b'PK':
-        # A zip file, i.e. PyTorch format
-        return lazy_load_torch_file(fp, path)
-    elif first8[2:4] == b'gg':
-        # GGML format
-        return lazy_load_ggml_file(fp, path)
-    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
-        # Probably safetensors
-        return lazy_load_safetensors_file(fp, path)
-    else:
-        raise ValueError(f"unknown format: {path}")
-
-
-In = TypeVar('In')
-Out = TypeVar('Out')
-
-
-def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
-    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
-    fast enough, this will stop calling `func` at some point rather than
-    letting results pile up in memory.  Specifically, there is a max of one
-    output value buffered per thread.'''
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures: List[concurrent.futures.Future[Out]] = []
-        items_rev = list(iterable)[::-1]
-        for i in range(min(concurrency, len(items_rev))):
-            futures.append(executor.submit(func, items_rev.pop()))
-        while futures:
-            result = futures.pop(0).result()
-            if items_rev:
-                futures.append(executor.submit(func, items_rev.pop()))
-            yield result
-
-
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
-    if params.n_vocab != vocab.vocab_size:
-        # GGMLVocab comes from the same file as the model so shouldn't mismatch:
-        assert isinstance(vocab, SentencePieceVocab)
-        if params.n_vocab == vocab.vocab_size_base:
-            print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
-            vocab.vocab_size = vocab.vocab_size_base
-            return
-        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
-        if vocab.fname_added_tokens is not None:
-            msg += f" combined with {vocab.fname_added_tokens}"
-        msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
-            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
-        raise Exception(msg)
-
-
-class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
-        self.fout = open(fname_out, "wb")
-
-    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
-        self.fout.write(b"ggjt"[::-1])  # magic
-        values = [
-            1,  # file version
-            params.n_vocab,
-            params.n_embd,
-            params.n_mult,
-            params.n_head,
-            params.n_layer,
-            params.n_embd // params.n_head,  # rot (obsolete)
-            file_type.value,
-        ]
-        self.fout.write(struct.pack("i" * len(values), *values))
-
-    def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
-        sname = name.encode('utf-8')
-        self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
-        self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-        self.fout.write(sname)
-        self.fout.seek((self.fout.tell() + 31) & -32)
-
-    def write_vocab(self, vocab: Vocab) -> None:
-        for text, score in vocab.all_tokens():
-            self.fout.write(struct.pack("i", len(text)))
-            self.fout.write(text)
-            self.fout.write(struct.pack("f", score))
-
-    @staticmethod
-    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
-        of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
-        of = OutputFile(fname_out)
-        of.write_file_header(params, file_type=GGMLFileType.AllF32)
-        of.write_vocab(vocab)
-        of.fout.close()
-
-    @staticmethod
-    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
-        check_vocab_size(params, vocab)
-        of = OutputFile(fname_out)
-        of.write_file_header(params, file_type)
-        print("Writing vocab...")
-        of.write_vocab(vocab)
-
-        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
-            name, lazy_tensor = item
-            return lazy_tensor.load().to_ggml().ndarray
-
-        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=1)
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
-            of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
-            ndarray.tofile(of.fout)
-        of.fout.close()
-
-
-def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
-    wq_type = model["layers.0.attention.wq.weight"].data_type
-    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
-        return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
-        return GGMLFileType.MostlyF16
-    if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
-                                     wq_type.have_addends):
-        if isinstance(model["output.weight"].data_type, QuantizedDataType):
-            return GGMLFileType.MostlyQ4_1
-        else:
-            return GGMLFileType.PerLayerIsQ4_1
-    if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
-        return GGMLFileType.MostlyQ4_0
-    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
-    raise Exception(f"Unexpected combination of types: {name_to_type}")
-
-
-def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
-    model = handle_quantization(model)
-
-    if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model, params)
-    model = filter_and_sort_tensors(model)
-
-    return model
-
-
-def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
-    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
-            for (name, tensor) in model.items()}
-
-
-def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
-    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
-    the nth path in the model.
-    '''
-    # Support the following patterns:
-    patterns: List[Tuple[str, str]] = [
-        # - x.00.pth, x.01.pth, etc.
-        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
-        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
-        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
-        # x.bin, x.bin.1, etc.
-        (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
-    ]
-    for regex, replacement in patterns:
-        if re.search(regex, path.name):
-            new_path = path.with_name(re.sub(regex, replacement, path.name))
-            if new_path.exists():
-                return new_path
-    return None
-
-
-def find_multifile_paths(path: Path) -> List[Path]:
-    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
-    the whole list of paths in the model.
-    '''
-    ret: List[Path] = []
-    for i in itertools.count():
-        nth_path = nth_multifile_path(path, i)
-        if nth_path is None:
-            break
-        ret.append(nth_path)
-    if not ret:
-        # No matches.  This should only happen if the file was named, e.g.,
-        # foo.0, and there was no file named foo.  Oh well, try to process it
-        # as a single file.
-        return [path]
-    return ret
-
-
-def load_some_model(path: Path) -> ModelPlus:
-    '''Load a model of any supported format.'''
-    # Be extra-friendly and accept either a file or a directory:
-    if path.is_dir():
-        # Check if it's a set of safetensors files first
-        files = list(path.glob("model-00001-of-*.safetensors"))
-        if not files:
-            # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
-            files = [file for glob in globs for file in path.glob(glob)]
-        if not files:
-            # Try GGML too, but with lower priority, since if both a non-GGML
-            # model and a GGML model exist in the same directory, we assume the
-            # latter was converted from the former.
-            files = list(path.glob("ggml-model*.bin*"))
-        if not files:
-            raise Exception(f"Can't find model in directory {path}")
-        if len(files) > 1:
-            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
-        path = files[0]
-
-    paths = find_multifile_paths(path)
-    models_plus: List[ModelPlus] = []
-    for path in paths:
-        print(f"Loading model file {path}")
-        models_plus.append(lazy_load_file(path))
-
-    model_plus = merge_multifile_models(models_plus)
-    return model_plus
-
-
-def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
-    return {name: model[name] for name in TENSORS_LIST if name in model}
-
-
-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
-    print(f"vocabtype: {vocabtype}")
-    # Be extra-friendly and accept either a file or a directory.  Also, if it's
-    # a directory, it might be the model directory, and tokenizer.model might
-    # be in the parent of that.
-    if path.is_dir():
-        vocab_file = "tokenizer.model"
-        if vocabtype == 'bpe':
-          vocab_file = "vocab.json"
-        path2 = path / vocab_file
-        # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / vocab_file
-        if path2.exists():
-            path = path2
-        elif path3.exists():
-            path = path3
-        else:
-            raise FileNotFoundError(
-                f"Could not find tokenizer.model in {path} or its parent; "
-                "if it's in another directory, pass the directory as --vocab-dir")
-    added_tokens_path = path.parent / "added_tokens.json"
-    print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
-                              vocabtype)
-
-
-def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32: "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ4_0: "q4_0",
-        GGMLFileType.MostlyQ4_1: "q4_1",
-        GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
-    if ret in model_paths:
-        sys.stderr.write(
-            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n")
-        sys.exit(1)
-    return ret
-
-
-def do_dump_model(model_plus: ModelPlus) -> None:
-    print(f"model_plus.paths = {model_plus.paths!r}")
-    print(f"model_plus.format = {model_plus.format!r}")
-    print(f"model_plus.vocab = {model_plus.vocab!r}")
-    for name, lazy_tensor in model_plus.model.items():
-        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
-
-
-def main(args_in: Optional[List[str]] = None) -> None:
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
-    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
-    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
-    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path,
-                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
-    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
-    args = parser.parse_args(args_in)
-
-    vocab: Vocab
-    if args.dump_single:
-        model_plus = lazy_load_file(args.model)
-        do_dump_model(model_plus)
-    elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        assert args.outfile, "need --outfile if using --vocab-only"
-        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, vocab)
-        print(f"Wrote {outfile}")
-    else:
-        model_plus = load_some_model(args.model)
-        if args.dump:
-            do_dump_model(model_plus)
-            return
-        if model_plus.vocab is not None and args.vocab_dir is None:
-            vocab = model_plus.vocab
-        else:
-            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-            vocab = load_vocab(vocab_dir, args.vocabtype)
-        params = Params.load(model_plus)
-        model = model_plus.model
-        model = do_necessary_conversions(model, params)
-        output_type = pick_output_type(model, args.outtype)
-        model = convert_to_output_type(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
-        OutputFile.write_all(outfile, params, output_type, model, vocab)
-        print(f"Wrote {outfile}")
-
-
-if __name__ == '__main__':
-    main()

From 58bde5c5c1c4d7f1201819a16734ea1b933279f4 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 04:35:06 +0200
Subject: [PATCH 12/25] Delete convert-permute-debug.py

---
 convert-permute-debug.py | 1032 --------------------------------------
 1 file changed, 1032 deletions(-)
 delete mode 100644 convert-permute-debug.py

diff --git a/convert-permute-debug.py b/convert-permute-debug.py
deleted file mode 100644
index fcbb5f2da..000000000
--- a/convert-permute-debug.py
+++ /dev/null
@@ -1,1032 +0,0 @@
-#!/usr/bin/env python
-
-import gguf
-import argparse
-import concurrent.futures
-import copy
-import enum
-import faulthandler
-import functools
-import io
-import itertools
-import json
-import math
-import mmap
-import pickle
-import re
-import signal
-import struct
-import sys
-import zipfile
-import numpy as np
-
-from abc import ABCMeta, abstractmethod
-from dataclasses import dataclass
-from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
-from sentencepiece import SentencePieceProcessor  # type: ignore
-
-if TYPE_CHECKING:
-    from typing_extensions import TypeAlias
-
-if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
-    faulthandler.register(signal.SIGUSR1)
-
-NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
-
-ARCH=gguf.MODEL_ARCH.LLAMA
-NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
-
-#
-# data types
-#
-
-@dataclass(frozen=True)
-class UnquantizedDataType:
-    name: str
-
-DT_F16  = UnquantizedDataType('F16')
-DT_F32  = UnquantizedDataType('F32')
-DT_I32  = UnquantizedDataType('I32')
-DT_BF16 = UnquantizedDataType('BF16')
-
-DataType = Union[UnquantizedDataType]
-
-DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
-    DT_BF16: np.dtype(np.uint16),
-    DT_F16:  np.dtype(np.float16),
-    DT_F32:  np.dtype(np.float32),
-    DT_I32:  np.dtype(np.int32),
-}
-
-NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
-    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
-
-SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
-    'BF16': DT_BF16,
-    'F16': DT_F16,
-    'F32': DT_F32,
-    'I32': DT_I32,
-}
-
-class GGMLFileType(enum.Enum):
-    AllF32    = 0
-    MostlyF16 = 1  # except 1d tensors
-
-    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
-        if len(tensor.shape) == 1:
-            # 1D tensors are always F32.
-            return DT_F32
-        elif self == GGMLFileType.AllF32:
-            return DT_F32
-        elif self == GGMLFileType.MostlyF16:
-            return DT_F16
-        else:
-            raise ValueError(self)
-
-
-#
-# hparams loading
-#
-
-@dataclass
-class Params:
-    n_vocab:    int
-    n_embd:     int
-    n_mult:     int
-    n_layer:    int
-    n_ctx:      int
-    n_ff:       int
-    n_head:     int
-    n_head_kv:  int
-    f_norm_eps: float
-
-    @staticmethod
-    def find_n_mult(n_ff: int, n_embd: int) -> int:
-        # hardcoded magic range
-        for n_mult in range(8192, 1, -1):
-            calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
-            if calc_ff == n_ff:
-                return n_mult
-        raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
-
-    @staticmethod
-    def guessed(model: 'LazyModel') -> 'Params':
-        # try transformer naming first
-        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
-
-        # try transformer naming first
-        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
-        else:
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
-
-        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
-
-        n_head = n_embd // 128 # guessed
-        n_mult = 256           # guessed
-
-        # TODO: verify this
-        n_ff = int(2 * (4 * n_embd) / 3)
-        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
-
-        return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_mult     = n_mult,
-            n_layer    = n_layer,
-            n_ctx      = -1,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head,
-            f_norm_eps = 1e-5,
-        )
-
-    @staticmethod
-    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
-        config = json.load(open(config_path))
-
-        n_vocab    = config["vocab_size"]
-        n_embd     = config["hidden_size"]
-        n_layer    = config["num_hidden_layers"]
-        n_ff       = config["intermediate_size"]
-        n_head     = config["num_attention_heads"]
-        n_head_kv  = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps = config["rms_norm_eps"]
-
-        n_mult = Params.find_n_mult(n_ff, n_embd)
-
-        if "max_sequence_length" in config:
-            n_ctx = config["max_sequence_length"]
-        elif "max_position_embeddings" in config:
-            n_ctx = config["max_position_embeddings"]
-        else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
-
-        return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_mult     = n_mult,
-            n_layer    = n_layer,
-            n_ctx      = n_ctx,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head_kv,
-            f_norm_eps = f_norm_eps,
-        )
-
-    # LLaMA v2 70B params.json
-    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
-    @staticmethod
-    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
-        config = json.load(open(config_path))
-
-        n_vocab    = config["vocab_size"]
-        n_embd     = config["dim"]
-        n_layer    = config["n_layers"]
-        n_mult     = config["multiple_of"]
-        n_ctx      = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
-        n_ff       = -1
-        n_head     = config["n_heads"]
-        n_head_kv  = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        f_norm_eps = config["norm_eps"]
-
-        if n_vocab == -1:
-            n_vocab = model["tok_embeddings.weight"].shape[0]
-
-        if n_ff == -1:
-            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
-
-        return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_mult     = n_mult,
-            n_layer    = n_layer,
-            n_ctx      = n_ctx,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head_kv,
-            f_norm_eps = f_norm_eps,
-        )
-
-    @staticmethod
-    def load(model_plus: 'ModelPlus') -> 'Params':
-        hf_config_path   = model_plus.paths[0].parent / "config.json"
-        orig_config_path = model_plus.paths[0].parent / "params.json"
-
-        if hf_config_path.exists():
-            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
-        elif orig_config_path.exists():
-            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        else:
-            params = Params.guessed(model_plus.model)
-
-        return params
-
-
-#
-# vocab
-#
-
-class BpeVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        added_tokens: Dict[str, int]
-        if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            added_tokens = {}
-        vocab_size: int = len(self.bpe_tokenizer)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
-
-    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2
-        byte_encoder = tokenization_gpt2.bytes_to_unicode()
-        byte_decoder = {v: k for k, v in byte_encoder.items()}
-        for i, item in enumerate(tokenizer):
-            text: bytes = item.encode("utf-8")
-            score: float = -i
-            yield text, score
-
-    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score
-
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: Dict[str, int]
-        if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            added_tokens = {}
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids   = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
-
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
-
-    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
-            text: bytes = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
-            yield text, score
-
-    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score
-
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-Vocab = Union[BpeVocab, SentencePieceVocab]
-
-
-#
-# data loading
-# TODO: reuse (probably move to gguf.py?)
-#
-
-def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    print( "permute " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_head_kv) )
-    if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
-    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                .swapaxes(1, 2)
-                .reshape(weights.shape))
-
-
-class Tensor(metaclass=ABCMeta):
-    data_type: DataType
-
-    @abstractmethod
-    def astype(self, data_type: DataType) -> 'Tensor': ...
-    @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
-    @abstractmethod
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
-    def part(self, n_part: int) -> 'UnquantizedTensor': ...
-    @abstractmethod
-    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
-
-
-def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
-    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
-    fp32_arr = bf16_arr.astype(np.uint32) << 16
-    return fp32_arr.view(np.float32)
-
-
-class UnquantizedTensor(Tensor):
-    def __init__(self, ndarray: NDArray) -> None:
-        assert isinstance(ndarray, np.ndarray)
-        self.ndarray = ndarray
-        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
-
-    def astype(self, data_type: DataType) -> Tensor:
-        dtype = DATA_TYPE_TO_NUMPY[data_type]
-        if self.data_type == DT_BF16:
-            self.ndarray = bf16_to_fp32(self.ndarray)
-        return UnquantizedTensor(self.ndarray.astype(dtype))
-
-    def to_ggml(self) -> 'UnquantizedTensor':
-        return self
-
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
-
-    def part(self, n_part: int) -> 'UnquantizedTensor':
-        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
-    def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
-
-
-def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
-    tensor = lazy_tensor.load()
-    assert isinstance(tensor, UnquantizedTensor)
-
-    # double-check:
-    actual_shape = list(tensor.ndarray.shape)
-    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
-    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
-        if convert:
-            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
-        else:
-            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
-
-    return tensor.ndarray
-
-
-GGMLCompatibleTensor = Union[UnquantizedTensor]
-
-
-class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
-        self.base = base
-        self.n_head = n_head
-        self.data_type = self.base.data_type
-
-    def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
-
-    def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
-
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor:
-        raise Exception("shouldn't permute twice")
-
-
-@dataclass
-class LazyTensor:
-    _load: Callable[[], Tensor]
-    shape: List[int]
-    data_type: DataType
-    description: str
-
-    def load(self) -> Tensor:
-        ret = self._load()
-        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
-        return ret
-
-    def astype(self, data_type: DataType) -> 'LazyTensor':
-        self.validate_conversion_to(data_type)
-
-        def load() -> Tensor:
-            return self.load().astype(data_type)
-        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
-
-    def validate_conversion_to(self, data_type: DataType) -> None:
-        if data_type == self.data_type:
-            return
-
-
-LazyModel = Dict[str, LazyTensor]
-
-
-@dataclass
-class ModelPlus:
-    model: LazyModel
-    paths: List[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors']
-    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
-
-
-def merge_sharded(models: List[LazyModel]) -> LazyModel:
-    # Original LLaMA models have each file contain one part of each tensor.
-    # Use a dict instead of a set to preserve order.
-    names = {name: None for model in models for name in model}
-
-    def convert(name: str) -> LazyTensor:
-        lazy_tensors: List[LazyTensor] = [model[name] for model in models]
-        if len(lazy_tensors) == 1:
-            # only one file; don't go through this procedure since there might
-            # be quantized tensors
-            return lazy_tensors[0]
-        if len(lazy_tensors[0].shape) == 1:
-            # the tensor is just duplicated in every file
-            return lazy_tensors[0]
-        if name.startswith('tok_embeddings.') or \
-           name.endswith('.attention.wo.weight') or \
-           name.endswith('.feed_forward.w2.weight'):
-            # split by columns
-            axis = 1
-        else:
-            # split by rows
-            axis = 0
-        concatenated_shape = list(lazy_tensors[0].shape)
-        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
-
-        def load() -> UnquantizedTensor:
-            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
-            return UnquantizedTensor(concatenated)
-        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
-        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
-    return {name: convert(name) for name in names}
-
-
-def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
-    assert len(formats) == 1, "different formats?"
-    format = formats.pop()
-    paths = [path for mp in models_plus for path in mp.paths]
-    # Use the first non-None vocab, if any.
-    try:
-        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
-    except StopIteration:
-        vocab = None
-
-    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
-        # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
-        model: LazyModel = {}
-        for mp in models_plus:
-            model.update(mp.model)
-    else:
-        model = merge_sharded([mp.model for mp in models_plus])
-
-    return ModelPlus(model, paths, format, vocab)
-
-
-def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head, n_head_kv)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
-
-def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().permute_part(n_part, n_head)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
-
-def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
-    def load() -> Tensor:
-        return lazy_tensor.load().part(n_part)
-    s = lazy_tensor.shape.copy()
-    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
-
-
-# Functionality that simulates `torch.load` but where individual tensors are
-# only loaded into memory on demand, not all at once.
-# PyTorch can't do this natively as of time of writing:
-# - https://github.com/pytorch/pytorch/issues/64327
-# This allows us to de-shard without multiplying RAM usage, and also
-# conveniently drops the PyTorch dependency (though we still need numpy).
-
-
-@dataclass
-class LazyStorageKind:
-    data_type: DataType
-
-
-@dataclass
-class LazyStorage:
-    load: Callable[[int, int], NDArray]
-    kind: LazyStorageKind
-    description: str
-
-
-class LazyUnpickler(pickle.Unpickler):
-    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
-        super().__init__(fp)
-        self.data_base_path = data_base_path
-        self.zip_file = zip_file
-
-    def persistent_load(self, pid: Any) -> Any:
-        assert pid[0] == 'storage'
-        assert isinstance(pid[1], LazyStorageKind)
-        data_type = pid[1].data_type
-        filename_stem = pid[2]
-        filename = self.data_base_path + '/' + filename_stem
-        info = self.zip_file.getinfo(filename)
-
-        def load(offset: int, elm_count: int) -> NDArray:
-            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
-            if dtype is None:
-                raise Exception("tensor stored in unsupported format")
-            fp = self.zip_file.open(info)
-            fp.seek(offset * dtype.itemsize)
-            size = elm_count * dtype.itemsize
-            data = fp.read(size)
-            assert len(data) == size
-            return np.frombuffer(data, dtype)
-        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
-        return LazyStorage(load=load, kind=pid[1], description=description)
-
-    # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                               # pyright: ignore[reportSelfClsParameterName]
-                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
-        assert isinstance(storage, LazyStorage)
-
-        def load() -> UnquantizedTensor:
-            elm_count = stride[0] * size[0]
-            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
-        description = f'pickled storage_offset={storage_offset} in {storage.description}'
-        return LazyTensor(load, list(size), storage.kind.data_type, description)
-
-    # @staticmethod
-    def rebuild_from_type_v2(func, new_type, args, state):
-        return func(*args)
-
-    CLASSES: Dict[Any, Any] = {
-        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
-        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
-        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
-        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
-        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
-        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
-        ('torch', 'Tensor'): LazyTensor,
-    }
-
-    def find_class(self, module: str, name: str) -> Any:
-        if not module.startswith('torch'):
-            return super().find_class(module, name)
-        return self.CLASSES[(module, name)]
-
-
-def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
-    zf = zipfile.ZipFile(outer_fp)
-    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
-    assert len(pickle_paths) == 1, pickle_paths
-    pickle_fp = zf.open(pickle_paths[0], 'r')
-    unpickler = LazyUnpickler(pickle_fp,
-                              data_base_path=pickle_paths[0][:-4],
-                              zip_file=zf)
-    model = unpickler.load()
-    as_dict = dict(model.items())
-    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
-
-
-def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
-    header_size, = struct.unpack('<Q', fp.read(8))
-    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
-    # Use mmap for the actual data to avoid race conditions with the file offset.
-    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    byte_buf = mapped[8 + header_size:]
-
-    def convert(info: Dict[str, Any]) -> LazyTensor:
-        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
-        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
-        shape: List[int] = info['shape']
-        begin, end = info['data_offsets']
-        assert 0 <= begin <= end <= len(byte_buf)
-        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
-        buf = byte_buf[begin:end]
-
-        def load() -> UnquantizedTensor:
-            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
-        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
-        return LazyTensor(load, shape, data_type, description)
-    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
-    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
-
-
-def must_read(fp: IO[bytes], length: int) -> bytes:
-    ret = fp.read(length)
-    if len(ret) < length:
-        raise Exception("unexpectedly reached end of file")
-    return ret
-
-
-@functools.lru_cache(maxsize=None)
-def lazy_load_file(path: Path) -> ModelPlus:
-    fp = open(path, 'rb')
-    first8 = fp.read(8)
-    fp.seek(0)
-    if first8[:2] == b'PK':
-        # A zip file, i.e. PyTorch format
-        return lazy_load_torch_file(fp, path)
-    elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
-        # Probably safetensors
-        return lazy_load_safetensors_file(fp, path)
-    else:
-        raise ValueError(f"unknown format: {path}")
-
-
-In = TypeVar('In')
-Out = TypeVar('Out')
-
-def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
-    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
-    fast enough, this will stop calling `func` at some point rather than
-    letting results pile up in memory.  Specifically, there is a max of one
-    output value buffered per thread.'''
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures: List[concurrent.futures.Future[Out]] = []
-        items_rev = list(iterable)[::-1]
-        for i in range(min(concurrency, len(items_rev))):
-            futures.append(executor.submit(func, items_rev.pop()))
-        while futures:
-            result = futures.pop(0).result()
-            if items_rev:
-                futures.append(executor.submit(func, items_rev.pop()))
-            yield result
-
-
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
-    if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
-        if params.n_vocab == vocab.vocab_size_base:
-            print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
-            vocab.vocab_size = vocab.vocab_size_base
-            return
-        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
-        if vocab.fname_added_tokens is not None:
-            msg += f" combined with {vocab.fname_added_tokens}"
-        msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
-            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
-        raise Exception(msg)
-
-
-class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-    def add_meta_arch(self, params: Params) -> None:
-        self.gguf.add_context_length      (params.n_ctx)
-        self.gguf.add_embedding_length    (params.n_embd)
-        self.gguf.add_block_count         (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
-        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
-        self.gguf.add_head_count          (params.n_head)
-        self.gguf.add_head_count_kv       (params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
-
-    def add_meta_vocab(self, vocab: Vocab) -> None:
-        tokens = []
-        scores = []
-        for text, score in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-
-        self.gguf.add_tokenizer_model("llama")
-        self.gguf.add_token_list(tokens)
-        self.gguf.add_token_scores(scores)
-        #self.gguf.add_token_types(toktypes) # TODO: add this
-
-        # TODO: added / special tokens
-
-    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
-        n_elements = 1
-        for dim in tensor.shape:
-            n_elements *= dim
-        data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
-        data_nbytes = n_elements * data_type.itemsize
-        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
-
-    def write_meta(self) -> None:
-        self.gguf.write_header_to_file()
-        self.gguf.write_kv_data_to_file()
-
-    def write_tensor_info(self) -> None:
-        self.gguf.write_ti_data_to_file()
-
-    def close(self) -> None:
-        self.gguf.close()
-
-    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
-        check_vocab_size(params, vocab)
-
-        of = OutputFile(fname_out)
-
-        # meta data
-        of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.write_meta()
-
-        of.close()
-
-    @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
-        check_vocab_size(params, vocab)
-
-        of = OutputFile(fname_out)
-
-        # meta data
-        of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-
-        # tensor info
-        for name, lazy_tensor in model.items():
-            of.add_tensor_info(name, lazy_tensor)
-
-        of.write_meta()
-        of.write_tensor_info()
-
-        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
-            name, lazy_tensor = item
-            return lazy_tensor.load().to_ggml().ndarray
-
-        # tensor data
-        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=1)
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
-            of.gguf.write_tensor_data(ndarray)
-
-        of.close()
-
-def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
-    wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
-
-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
-        return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
-        return GGMLFileType.MostlyF16
-
-    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
-
-    raise Exception(f"Unexpected combination of types: {name_to_type}")
-
-def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
-    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
-            for (name, tensor) in model.items()}
-
-def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
-    tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
-
-    tmp = model
-
-    # HF models permut or pack some of the tensors, so we need to undo that
-    for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
-            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
-           #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
-        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
-            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
-            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
-        else:
-            break
-
-    out: LazyModel = {}
-    for name, lazy_tensor in model.items():
-        name_new = name
-
-        if name in tmap:
-            name_new = tmap[name]
-        elif name.endswith(".weight") and name[:-7] in tmap:
-            name_new = tmap[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tmap:
-            name_new = tmap[name[:-5]] + ".bias"
-        else:
-            raise Exception(f"Unexpected tensor name: {name}")
-
-        if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new):
-            print(f"skipping tensor {name_new}")
-            continue
-        else:
-            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
-            out[name_new] = lazy_tensor
-
-    return out
-
-def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
-    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
-    the nth path in the model.
-    '''
-    # Support the following patterns:
-    patterns: List[Tuple[str, str]] = [
-        # - x.00.pth, x.01.pth, etc.
-        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
-        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
-        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
-        # x.bin, x.bin.1, etc.
-        (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
-    ]
-    for regex, replacement in patterns:
-        if re.search(regex, path.name):
-            new_path = path.with_name(re.sub(regex, replacement, path.name))
-            if new_path.exists():
-                return new_path
-    return None
-
-
-def find_multifile_paths(path: Path) -> List[Path]:
-    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
-    the whole list of paths in the model.
-    '''
-    ret: List[Path] = []
-    for i in itertools.count():
-        nth_path = nth_multifile_path(path, i)
-        if nth_path is None:
-            break
-        ret.append(nth_path)
-    if not ret:
-        # No matches.  This should only happen if the file was named, e.g.,
-        # foo.0, and there was no file named foo.  Oh well, try to process it
-        # as a single file.
-        return [path]
-    return ret
-
-
-def load_some_model(path: Path) -> ModelPlus:
-    '''Load a model of any supported format.'''
-    # Be extra-friendly and accept either a file or a directory:
-    if path.is_dir():
-        # Check if it's a set of safetensors files first
-        files = list(path.glob("model-00001-of-*.safetensors"))
-        if not files:
-            # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
-            files = [file for glob in globs for file in path.glob(glob)]
-        if not files:
-            raise Exception(f"Can't find model in directory {path}")
-        if len(files) > 1:
-            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
-        path = files[0]
-
-    paths = find_multifile_paths(path)
-    models_plus: List[ModelPlus] = []
-    for path in paths:
-        print(f"Loading model file {path}")
-        models_plus.append(lazy_load_file(path))
-
-    model_plus = merge_multifile_models(models_plus)
-    return model_plus
-
-
-def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
-    # Be extra-friendly and accept either a file or a directory.  Also, if it's
-    # a directory, it might be the model directory, and tokenizer.model might
-    # be in the parent of that.
-    if path.is_dir():
-        vocab_file = "tokenizer.model"
-        if vocabtype == 'bpe':
-            vocab_file = "vocab.json"
-        path2 = path / vocab_file
-        # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / vocab_file
-        if path2.exists():
-            path = path2
-        elif path3.exists():
-            path = path3
-        else:
-            raise FileNotFoundError(
-                f"Could not find tokenizer.model in {path} or its parent; "
-                "if it's in another directory, pass the directory as --vocab-dir")
-
-    print(f"Loading vocab file '{path}', type '{vocabtype}'")
-
-    added_tokens_path = path.parent / "added_tokens.json"
-    if vocabtype == "bpe":
-        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
-    elif vocabtype == "spm":
-        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
-    else:
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
-
-
-def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32:    "f32",
-        GGMLFileType.MostlyF16: "f16",
-    }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
-    if ret in model_paths:
-        sys.stderr.write(
-            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n")
-        sys.exit(1)
-    return ret
-
-
-def do_dump_model(model_plus: ModelPlus) -> None:
-    print(f"model_plus.paths = {model_plus.paths!r}")
-    print(f"model_plus.format = {model_plus.format!r}")
-    print(f"model_plus.vocab = {model_plus.vocab!r}")
-    for name, lazy_tensor in model_plus.model.items():
-        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
-
-
-def main(args_in: Optional[List[str]] = None) -> None:
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
-    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
-    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
-    parser.add_argument("--outtype",     choices=["f32", "f16"], help="output format (default: based on input)")
-    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
-    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
-    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
-    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
-    args = parser.parse_args(args_in)
-
-    if args.dump_single:
-        model_plus = lazy_load_file(args.model)
-        do_dump_model(model_plus)
-
-    model_plus = load_some_model(args.model)
-
-    params = Params.load(model_plus)
-    if params.n_ctx == -1:
-        if args.ctx is None:
-            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                            "Please specify one with --ctx:\n"
-                            " - LLaMA v1: --ctx 2048\n"
-                            " - LLaMA v2: --ctx 4096\n")
-        params.n_ctx = args.ctx
-
-    print(f"params = {params}")
-
-    vocab: Vocab
-    if args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        assert args.outfile, "need --outfile if using --vocab-only"
-        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab)
-        print(f"Wrote {outfile}")
-    else:
-        if args.dump:
-            do_dump_model(model_plus)
-            return
-
-        if model_plus.vocab is not None and args.vocab_dir is None:
-            vocab = model_plus.vocab
-        else:
-            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-            vocab = load_vocab(vocab_dir, args.vocabtype)
-
-        model       = model_plus.model
-        model       = convert_model_names(model, params)
-        output_type = pick_output_type(model, args.outtype)
-        model       = convert_to_output_type(model, output_type)
-        outfile     = args.outfile or default_outfile(model_plus.paths, output_type)
-
-        OutputFile.write_all(outfile, params, model, vocab)
-        print(f"Wrote {outfile}")
-
-
-if __name__ == '__main__':
-    main()

From c818c405e025a766c0ab12afb18c4baec7810a6d Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 04:42:09 +0200
Subject: [PATCH 13/25] convert-llama-hf-to-gguf.py : fix attn_q permute

---
 convert-llama-hf-to-gguf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py
index 9289efd28..6ee5fdf76 100644
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -264,7 +264,9 @@ for part_name in part_names:
         data = data.squeeze().numpy()
 
         # reverse permute these
-        if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+        if name.endswith(".q_proj.weight"):
+            data = reverse_hf_permute(data, head_count)
+        if name.endswith(".k_proj.weight"):
             data = reverse_hf_permute(data, head_count, head_count_kv)
 
         # map tensor names

From cb1c0727bd59803b439b6a3af121c99e6393ff3d Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Mon, 21 Aug 2023 11:11:31 +0300
Subject: [PATCH 14/25] HellaSwag: split token evaluation into batches if
 needed (#2681)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/perplexity/perplexity.cpp | 39 +++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 682c39b16..2409db69f 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -122,6 +122,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     printf("\n");
 }
 
+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+        int n_vocab, int n_thread) {
+    std::vector<float> result;
+    result.reserve(tokens.size() * n_vocab);
+    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
+    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
+        size_t n_tokens = tokens.size() - i_chunk * n_batch;
+        n_tokens = std::min(n_tokens, size_t(n_batch));
+        if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return {};
+        }
+
+        const auto logits = llama_get_logits(ctx);
+        result.insert(result.end(), logits, logits + n_tokens * n_vocab);
+
+        n_past += n_tokens;
+    }
+    return result;
+}
+
 void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //
@@ -235,15 +256,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             query_embd.resize(32);
         }
 
-        // Evaluate the query
-        if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        if (logits.empty()) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return;
         }
 
-        auto query_logits = llama_get_logits(ctx);
-
-        std::memcpy(tok_logits.data(), query_logits + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
         const auto first_probs = softmax(tok_logits);
 
         hs_data[task_idx].ending_logprob_count[0] = 1;
@@ -252,7 +271,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         // Calculate the logprobs over the ending
         for (size_t j = context_size; j < query_size - 1; j++) {
 
-            std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
+            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
 
             const float prob = softmax(tok_logits)[query_embd[j + 1]];
 
@@ -271,7 +290,6 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             // Tokenize the query
             query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
             query_size = query_embd.size();
-            //printf("Second query: %d\n",(int)query_size);
 
             // Stop if query wont fit the ctx window
             if (context_size + query_size > (size_t)params.n_ctx) {
@@ -286,19 +304,18 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             //}
 
             // Evaluate the query
-            if (llama_eval(ctx, query_embd.data(), query_embd.size(), context_size, params.n_threads)) {
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            if (logits.empty()) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
 
-            query_logits = llama_get_logits(ctx);
-
             hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
             hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
 
             // Calculate the logprobs over the ending
             for (size_t j = 0; j < query_size - 1; j++) {
-                std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
+                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
 
                 const float prob = softmax(tok_logits)[query_embd[j + 1]];
 

From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH 15/25] metal : fix synchronization in new matrix
 multiplication kernel (#2686)

---
 ggml-metal.metal | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f3125236..88d48f6c6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
+            threadgroup_barrier(mem_flags::mem_device);
             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
         }
 
-        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup_barrier(mem_flags::mem_device);
         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
         if (sgitg==0) {
             for (int i = 0; i < n_rows; i++) {

From 6a69a693cbd9695faec0c449f237686f3bdfa281 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 13:23:10 +0200
Subject: [PATCH 16/25] gguf.py : fix rope scale kv

---
 gguf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gguf.py b/gguf.py
index e5eb85ded..d461b8d40 100644
--- a/gguf.py
+++ b/gguf.py
@@ -45,7 +45,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
 
 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
-KEY_ROPE_SCALE           = "{arch}.rope.scale"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
 
 # tokenization
 KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
@@ -620,8 +620,8 @@ class GGUFWriter:
         self.add_uint32(
             KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
 
-    def add_rope_scale(self, value:  float):
-        self.add_float32(KEY_ROPE_SCALE.format(arch=self.arch), value)
+    def add_rope_scale_linear(self, value:  float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
 
     def add_tokenizer_model(self, model: str):
         self.add_string(KEY_TOKENIZER_MODEL, model)

From 5f6ff387ca90c5d6f690030d19751e25e0bbc024 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 13:25:14 +0200
Subject: [PATCH 17/25] convert-llama-hf-to-gguf.py : rope scale and added
 tokens

---
 convert-llama-hf-to-gguf.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py
index 6ee5fdf76..3a7d4c6c8 100644
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -126,6 +126,11 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 
+if "rope_scaling" in hparams and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
 
 # TOKENIZATION
 
@@ -155,9 +160,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
         if tokenizer.is_control(i):
             toktype = 3
 
-        # TODO: How to determinate if a token is user defined?
-        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-        # if tokenizer.is_user_defined(i): toktype = 4
+        # toktype = 4 is user-defined = tokens from added_tokens.json
 
         if tokenizer.is_unused(i):
             toktype = 5
@@ -168,6 +171,18 @@ if Path(dir_model + "/tokenizer.model").is_file():
         scores.append(score)
         toktypes.append(toktype)
 
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") ) 
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+
     gguf_writer.add_tokenizer_model("llama")
     gguf_writer.add_token_list(tokens)
     gguf_writer.add_token_scores(scores)

From dc1f0510134ba743d5a87bf8c62c23023e1f44f7 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 13:27:53 +0200
Subject: [PATCH 18/25] convert-llama-7b-pth-to-gguf.py : rope scale and added
 tokens

---
 convert-llama-7b-pth-to-gguf.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index 77edd026c..9e2f2099e 100644
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -118,6 +118,11 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 
+if "rope_scaling" in hparams and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
 
 # TOKENIZATION
 
@@ -147,9 +152,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
         if tokenizer.is_control(i):
             toktype = 3
 
-        # TODO: How to determinate if a token is user defined?
-        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-        # if tokenizer.is_user_defined(i): toktype = 4
+        # toktype = 4 is user-defined = tokens from added_tokens.json
 
         if tokenizer.is_unused(i):
             toktype = 5
@@ -160,6 +163,17 @@ if Path(dir_model + "/tokenizer.model").is_file():
         scores.append(score)
         toktypes.append(toktype)
 
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") ) 
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
     gguf_writer.add_tokenizer_model("llama")
     gguf_writer.add_token_list(tokens)
     gguf_writer.add_token_scores(scores)

From c082b9fa0b6b55ed4be0c1f90461f44e266542dc Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 13:30:03 +0200
Subject: [PATCH 19/25] llama.cpp : use rope scale kv

---
 llama.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 86c943fc6..ec954b84f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1435,6 +1435,14 @@ static void llama_model_load_internal(
         hparams.n_head_kv = hparams.n_head;
         GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
 
+        // TODO: manually setting rope scale should override this
+        // rope_freq_scale (inverse of the kv) is optional
+        float ropescale = 1.0f;
+        GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
+        if (ropescale != 1.0f) {
+            rope_freq_scale = 1.0f/ropescale;
+        }
+
         // get general kv
         GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
         GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");

From 9070e330abc410e21d3296d7ca1a4d5f8facc8f4 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 14:11:22 +0200
Subject: [PATCH 20/25] convert-llama-7b-pth-to-gguf.py : rope scale fix

---
 convert-llama-7b-pth-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index 9e2f2099e..3a606b55d 100644
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -118,7 +118,7 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 
-if "rope_scaling" in hparams and "factor" in hparams["rope_scaling"]:
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
     if "type" in hparams["rope_scaling"]:
         if hparams["rope_scaling"]["type"] == "linear":
             gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])

From 7a7d1ba68ab511269f5e686b7d4d2b7c6b9a3e45 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Mon, 21 Aug 2023 14:12:02 +0200
Subject: [PATCH 21/25] convert-llama-hf-to-gguf.py : rope scale fix

---
 convert-llama-hf-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py
index 3a7d4c6c8..d5b3897c7 100644
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -126,7 +126,7 @@ gguf_writer.add_head_count(head_count)
 gguf_writer.add_head_count_kv(head_count_kv)
 gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 
-if "rope_scaling" in hparams and "factor" in hparams["rope_scaling"]:
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
     if "type" in hparams["rope_scaling"]:
         if hparams["rope_scaling"]["type"] == "linear":
             gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])

From 6490ff7198ced6fed47865d90eed866817cad7da Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Aug 2023 16:42:27 +0300
Subject: [PATCH 22/25] py : fix whitespace

---
 convert-falcon-hf-to-gguf.py    | 2 +-
 convert-llama-7b-pth-to-gguf.py | 2 +-
 convert-llama-hf-to-gguf.py     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index e0da3a04d..b3e190a0f 100644
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -250,7 +250,7 @@ for part_name in part_names:
             sys.exit()
 
         n_dims = len(data.shape)
-        data_dtype = data.dtype 
+        data_dtype = data.dtype
 
         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data_dtype == np.float16:
diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index 3a606b55d..ab5c80b69 100644
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -170,7 +170,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
             print("gguf: get added tokens")
 
             for key in addtokens_json:
-                tokens.append( key.encode("utf-8") ) 
+                tokens.append( key.encode("utf-8") )
                 scores.append(-1000.0)
                 toktypes.append(4) # user-defined token type
 
diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py
index d5b3897c7..f8cfdaa80 100644
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -178,7 +178,7 @@ if Path(dir_model + "/tokenizer.model").is_file():
             print("gguf: get added tokens")
 
             for key in addtokens_json:
-                tokens.append( key.encode("utf-8") ) 
+                tokens.append( key.encode("utf-8") )
                 scores.append(-1000.0)
                 toktypes.append(4) # user-defined token type
 
@@ -294,7 +294,7 @@ for part_name in part_names:
             sys.exit()
 
         n_dims = len(data.shape)
-        data_dtype = data.dtype 
+        data_dtype = data.dtype
 
         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data_dtype == np.float16:

From e06cbcee73dfd32ea2c159a79da62b3a5afc7640 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Mon, 21 Aug 2023 08:45:52 -0600
Subject: [PATCH 23/25] gguf : add Python script to convert GGMLv3 LLaMA models
 to GGUF (#2682)

* First pass at converting GGMLv3 LLaMA models to GGUF

* Cleanups, better output during conversion

* Fix vocab space conversion logic

* More vocab conversion fixes

* Add description to converted GGUF files

* Improve help text, expand warning

* Allow specifying name and description for output GGUF

* Allow overriding vocab and hyperparams from original model metadata

* Use correct params override var name

* Fix wrong type size for Q8_K

Better handling of original style metadata

* Set default value for gguf add_tensor raw_shape KW arg
---
 convert-llama-ggmlv3-to-gguf.py | 334 ++++++++++++++++++++++++++++++++
 gguf.py                         |  52 +++--
 2 files changed, 374 insertions(+), 12 deletions(-)
 create mode 100644 convert-llama-ggmlv3-to-gguf.py

diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py
new file mode 100644
index 000000000..30038072f
--- /dev/null
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -0,0 +1,334 @@
+import sys, struct, math, argparse
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
+        self.n_ff = 0
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            self.ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+
+class Vocab:
+    def __init__(self):
+        self.items = []
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            vocab = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            score = struct.unpack('<f', data[offset:offset + 4])[0]
+            offset += 4
+            self.items.append((vocab, score))
+        return offset - orig_offset
+
+class Tensor:
+    def __init__(self):
+        self.name = None
+        self.dims = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = 0
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype= dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = (n_elems * tysize) // blksize
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLV3Model:
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
+            raise ValueError('Only GGJTv3 supported')
+        return 8
+
+    def load(self, data, offset):
+        offset += self.validate_header(data, offset)
+        hp = Hyperparameters()
+        offset += hp.load(data, offset)
+        vocab = Vocab()
+        offset += vocab.load(data, offset, hp.n_vocab)
+        tensors = []
+        tensor_map = {}
+        while offset < len(data):
+            tensor = Tensor()
+            offset += tensor.load(data, offset)
+            tensor_map[tensor.name] = len(tensors)
+            tensors.append(tensor)
+        self.hyperparameters = hp
+        self.vocab = vocab
+        self.tensors = tensors
+        self.tensor_map = tensor_map
+        hp.set_n_ff(self)
+        return offset
+
+class GGMLToGGUF:
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        print('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        self.add_tensors(gguf_writer)
+        print("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        print("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        print("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        print('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            print('* Adding vocab item(s)')
+            for (idx, vitem) in enumerate(vo.all_tokens()):
+                if len(vitem) == 3:
+                    tokens.append(vitem[0])
+                    scores.append(vitem[1])
+                    toktypes.append(vitem[2])
+                else:
+                    # Maybe try to guess the token type here?
+                    tokens.append(vitem[0])
+                    scores.append(vitem[1])
+            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            if len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                hv = hex(vbytes[0])[2:].upper()
+                vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+
+    def add_tensors(self, gguf_writer):
+        nm = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            if name.endswith('.weight'):
+                name = name[:-7]
+                suffix = '.weight'
+            elif name.endswith('.bias'):
+                name = name[:-5]
+                suffix = '.bias'
+            mapped_name = nm.get(name)
+            assert mapped_name is not None, f'Bad name {name}'
+            mapped_name += suffix
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+    import convert
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path   = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    convert.check_vocab_size(params, vocab)
+    return (params, vocab)
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
+    parser.add_argument('--name', help = 'Set model name')
+    parser.add_argument('--desc', help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    return parser.parse_args()
+
+def main():
+    cfg = handle_args()
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLV3Model()
+    print('* Scanning GGML input file')
+    offset = model.load(data, 0)
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+    else:
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter.save()
+    print(f'* Successful completion. Output saved to: {cfg.output}')
+
+main()
diff --git a/gguf.py b/gguf.py
index d461b8d40..60ee52f09 100644
--- a/gguf.py
+++ b/gguf.py
@@ -5,7 +5,7 @@ import tempfile
 import numpy as np
 
 from enum import IntEnum, auto
-from typing import Any, IO, List
+from typing import Any, IO, List, Optional
 
 #
 # constants
@@ -325,8 +325,20 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
 
 
 class GGMLQuantizationType(IntEnum):
-    F32 = 0
-    F16 = 1
+    F32  = 0
+    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15
 
 
 class GGUFValueType(IntEnum):
@@ -359,7 +371,7 @@ class GGUFValueType(IntEnum):
 
 
 class GGUFWriter:
-    def __init__(self, path: str, arch: str):
+    def __init__(self, path: str, arch: str, use_temp_file = True):
         self.fout = open(path, "wb")
         self.arch = arch
         self.offset_tensor = 0
@@ -369,6 +381,8 @@ class GGUFWriter:
         self.ti_data = b""
         self.ti_data_count = 0
         self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []
 
     def write_header_to_file(self):
         self.fout.write(struct.pack("<I", GGUF_MAGIC))
@@ -476,8 +490,8 @@ class GGUFWriter:
     def ggml_pad(x: int, n: int) -> int:
         return ((x + n - 1) // n) * n
 
-    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
-        assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
 
         encoded_name = name.encode("utf8")
         self.ti_data += struct.pack("<I", len(encoded_name))
@@ -486,23 +500,30 @@ class GGUFWriter:
         self.ti_data += struct.pack("<I", n_dims)
         for i in range(n_dims):
             self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
-
-        dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
         self.ti_data += struct.pack("<I", dtype)
         self.ti_data += struct.pack("<Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1
 
-    def add_tensor(self, name: str, tensor: np.ndarray):
-        if not hasattr(self, "temp_file"):
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
             self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             self.temp_file.seek(0)
 
-        self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return
 
         tensor.tofile(self.temp_file)
 
-        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
         if pad != 0:
             self.temp_file.write(bytes([0] * pad))
 
@@ -524,6 +545,13 @@ class GGUFWriter:
         if pad != 0:
             self.fout.write(bytes([0] * pad))
 
+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
         self.temp_file.seek(0)
 
         shutil.copyfileobj(self.temp_file, self.fout)

From 8d177eddeb8c0b284a47a6840ca820ed846708c1 Mon Sep 17 00:00:00 2001
From: goerch <jhr.walter@t-online.de>
Date: Mon, 21 Aug 2023 17:56:02 +0200
Subject: [PATCH 24/25] llama : improve token type support (#2668)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on windows)

* Improved tokenizer test

But does it work on MacOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment
---
 convert.py                   |  30 ++++++--
 llama.cpp                    | 128 ++++++++++++++---------------------
 models/ggml-vocab-llama.gguf | Bin 467382 -> 595423 bytes
 tests/test-tokenizer-1.cpp   |  34 ++++++----
 4 files changed, 94 insertions(+), 98 deletions(-)

diff --git a/convert.py b/convert.py
index df589928b..f680f8596 100755
--- a/convert.py
+++ b/convert.py
@@ -261,12 +261,12 @@ class BpeVocab:
         for i, item in enumerate(tokenizer):
             text: bytes = item.encode("utf-8")
             score: float = -i
-            yield text, score
+            yield text, score, 4
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score
+            yield text.encode("utf-8"), score, 4
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.bpe_tokens()
@@ -303,12 +303,28 @@ class SentencePieceVocab:
             piece = tokenizer.id_to_piece(i)
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
-            yield text, score
+
+            toktype = 1  # defualt to normal token type
+            if tokenizer.is_unknown(i):
+                toktype = 2
+            if tokenizer.is_control(i):
+                toktype = 3
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = 4
+
+            if tokenizer.is_unused(i):
+                toktype = 5
+            if tokenizer.is_byte(i):
+                toktype = 6
+
+            yield text, score, toktype
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score
+            yield text.encode("utf-8"), score, 4
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
@@ -721,14 +737,16 @@ class OutputFile:
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
-        for text, score in vocab.all_tokens():
+        toktypes = []
+        for text, score, toktype in vocab.all_tokens():
             tokens.append(text)
             scores.append(score)
+            toktypes.append(toktype)
 
         self.gguf.add_tokenizer_model("llama")
         self.gguf.add_token_list(tokens)
         self.gguf.add_token_scores(scores)
-        #self.gguf.add_token_types(toktypes) # TODO: add this
+        self.gguf.add_token_types(toktypes)
 
         # TODO: added / special tokens
 
diff --git a/llama.cpp b/llama.cpp
index ec954b84f..1785025f0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -772,15 +772,16 @@ struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
 
-    struct token_score {
+    struct token_data {
         token tok;
         float score;
+        int toktype;
     };
 
     llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score>      id_to_token;
+    std::vector<token_data>       id_to_token;
 
     // default LLaMA special tokens
     id special_bos_id = 1;
@@ -1507,17 +1508,25 @@ static void llama_model_load_internal(
 
         const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
 
+        const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+        if (toktype_idx == -1) {
+            throw std::runtime_error("cannot find token type list in GGUF file\n");
+        }
+
+        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
             std::string word = gguf_get_arr_str(ctx, token_idx, i);
 
             vocab.token_to_id[word] = i;
 
-            auto & tok_score = vocab.id_to_token[i];
-            tok_score.tok = std::move(word);
-            tok_score.score = scores[i];
+            auto & token_data = vocab.id_to_token[i];
+            token_data.tok = std::move(word);
+            token_data.score = scores[i];
+            token_data.toktype = toktypes[i];
 
             // determine the newline token: 0x0A == 10 == '\n'
-            if (tok_score.tok == "<0x0A>") {
+            if (token_data.tok == "<0x0A>") {
                 vocab.linefeed_id = i;
             }
         }
@@ -2345,92 +2354,57 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
     return vocab.type;
 }
 
-static bool llama_is_normal_token(const llama_vocab & vocab, llama_token token) {
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
-        return token >= 259;
-    }
-
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
-        return token >= 95;
-    }
-
-    return false;
+static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].toktype == 1;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token token) {
-    return token == vocab.special_bos_id;
+static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].toktype == 2;
 }
 
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token token) {
-    return token == vocab.special_eos_id;
+static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].toktype == 3;
 }
 
-static bool llama_is_control_token(const llama_vocab & vocab, llama_token token) {
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
-        return token == llama_is_bos_token(vocab, token) || token == llama_is_eos_token(vocab, token);
-    }
-
-    // TODO: improve?
-    return false;
+static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(llama_is_control_token(vocab, id));
+    return id == vocab.special_bos_id;
 }
 
-static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token token) {
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
-        return token == 0;
-    }
-
-    // TODO: improve?
-    return false;
+static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
+    GGML_ASSERT(llama_is_control_token(vocab, id));
+    return id == vocab.special_eos_id;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token token) {
-    GGML_UNUSED(vocab);
-    GGML_UNUSED(token);
-    // TODO: improve?
-    return false;
+static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
+    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
+    return id == vocab.special_pad_id;
 }
 
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token token) {
-    GGML_UNUSED(vocab);
-    GGML_UNUSED(token);
-    // TODO: improve?
-    return false;
+static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].toktype == 4;
 }
 
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token token) {
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
-        return 3 <= token && token < 259;
-    }
-
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
-        return 1 <= token && token < 95;
-    }
-
-    return false;
+static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].toktype == 5;
 }
 
-static uint8_t llama_byte_to_char(const llama_vocab & vocab, uint8_t byte) {
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
-        return byte - 3;
-    }
-
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
-        return byte + 32;
-    }
-
-    return false;
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].toktype == 6;
 }
 
-static uint8_t llama_char_to_byte(const llama_vocab & vocab, uint8_t ch) {
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
-        return ch + 3;
-    }
+static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(llama_is_byte_token(vocab, id));
+    const auto& token_data = vocab.id_to_token.at(id);
+    auto buf = token_data.tok.substr(3, 2);
+    return strtol(buf.c_str(), NULL, 16);
+}
 
-    if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
-        return ch - 32;
-    }
-
-    return false;
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    char buf[7];
+    int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+    GGML_ASSERT(0 <= result && result < 7);
+    return vocab.token_to_id.at(buf);
 }
 
 static std::string llama_escape_whitespace(const std::string& text) {
@@ -2569,7 +2543,7 @@ private:
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
             for (int j = 0; j < (int)symbol.n; ++j) {
-                llama_vocab::id token_id = llama_char_to_byte(vocab_, symbol.text[j]);
+                llama_vocab::id token_id = llama_byte_to_token(vocab_, symbol.text[j]);
                 output.push_back(token_id);
             }
             return;
@@ -2595,12 +2569,12 @@ private:
             return;
         }
 
-        const auto &tok_score = vocab_.id_to_token[(*token).second];
+        const auto &tok_data = vocab_.id_to_token[(*token).second];
 
         llama_sp_bigram bigram;
         bigram.left = left;
         bigram.right = right;
-        bigram.score = tok_score.score;
+        bigram.score = tok_data.score;
         bigram.size = text.size();
         work_queue_.push(bigram);
 
@@ -5109,7 +5083,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
             if (length < 1) {
                 return -1;
             }
-            buf[0] = llama_byte_to_char(model->vocab, token);
+            buf[0] = llama_token_to_byte(model->vocab, token);
             return 1;
         }
     }
diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama.gguf
index c50db67dc52023444e607bde7b684da5d631e564..63bfaf672f382c0f5bbcffe54736e2698ef3ac55 100644
GIT binary patch
delta 129117
zcmeI(y=qf&0Egk6endOu0tyZe&fUZ-&|ZM(YL-yMPo#^2oI%jp>u_?>$;k`xR{Td@
zU6R1NJi?)fG^c&jlRW2F{Py$KuP=92mzR$oOpCvNR-ae*r<-3No=qn|?tQ%bVLg3X
zPut^Y9LM808OP~3mfzRYjmNj=^YMAtubypQK7F@&b8&U`;`zm&zdpHs`+9RW{_%AC
z!0#7}qw(SR-?1DY4|B|idY~O2XPo~)-^p%|Gan}7ZYLk-p40DsInLjH0iOSTegL<x
zI{O`?|9%g!=jU&q|MCl%%af0D&*^tRZLj~dfagEW@A+@@yZ*iYrv<$Jr}<s~ZT<!S
z@n2bh|Jxf7*MIrbznI+rwKpLA*8;@rzt{h=fa~A&Ul#EG@BP0l!2kST7T~{9KjZhW
z<%`cax98vUUl!p1QopW${%_xaT>q~BvViN~^<Ng?|MKqF@BiHYm3Kex|M{=fkNZFV
zEA`|5fAAmwl?C{Z|H=aV$A4u3{^P&00RNTxvH$U3sUQ0v|CRc&|M6d`ANwEw@n2bh
z|M;&gz<>N#7T`brD+};nsUQ0v|CRc&|M6d`ANwEwmHM&&@gM({1^AEu$^!hye`Nvw
z<G->1|CRc&|M6d`ANwEwmHM&&@n5MQ`yc=DUs-_v_^&L$fBaV#;6MH=3-Dj5ANwEw
zmHM&&@n5MQ`yc<6`mz7<AODpF_>cd}0{q8+WdZ)<zp?=TmHM&&@n5MQ`yc<6`mz7<
zU#TDaAOG=RS%CleuPnfS{8tv>KmIEV@L#DP`yc<6`mz7<U#TDaAODs5vH$TO|CI&!
zkN?U7{KtP~0siB^vH<^;`mz7<U#TDaAODs5vH$U3sUQ0v|M6d0fdBZfEWm&KR~Fzu
z{woXcU#TDaAODs5vH$U3sUQ0v|CRc&|M4IHl?C{Z|H=aV$A4u3{^P&00RNTxvH$U3
zsUQ0v|CRc&|M6d`ANwEw@n2bh|M;&gz<>N#7T`brD+};nsUQ0v|CRc&|M6d`ANwEw
zmHM&&@gM({1^AEu$^!hye`Nvw<G->1|CRc&|M6d`ANwEwmHM&&@n5MQ`yc=DUs-_v
z_^&L$fBaV#;6MH=3-Dj5ANwEwmHM&&@n5MQ`yc<6`mz7<AODpF_>cd}0{q8+WdZ)<
zzp?=TmHM&&@n5MQ`yc<6`mz7<U#TDaAOG=RS%CleuPnfS{8tv>KmIEV@L#DP`yc<6
z`mz7<U#TDaAODs5vH$TO|CI&!kN?U7{KtP~0siB^vH<^;`mz7<U#TDaAODs5vH$U3
zsUQ0v|M6d0fdBZfEWm&KR~Fzu{woXcU#TDaAODs5vH$U3sUQ0v|CRc&|M4IHl?C{Z
z|H=aV$A4u3{^P&00RNTxvH$U3sUQ0v|CRc&|M6d`ANwEw@n2bh|M;&gz<>N#7T`br
zD+};nsUQ0v|CRc&|M6d`ANwEwmHM&&@gM({1^AEu$^!hye`Nvw<G->1|CRc&|M6d`
zANwEwmHM&&@n5MQ`yc=DUs-_v_^&L$fBaV#;6MH=3-Dj5ANwEwmHM&&@n5MQ`yc<6
z`mz7<AODpF_>cd}0{q8+WdZ)<zp?=TmHM&&@n5MQ`yc<6`mz7<U#TDaAOG=RS%Cle
zuPnfS{8tv>KmIEV@L#DP`yc<6`mz7<U#TDaAODs5vH$TO|CI&!kN?U7{MY*L{J$?g
FzX4J`kwX9g

delta 67
zcmcb=T4mc|8D4kyP&Y;f2;iB>>&hswF_@E)WzA7_^X3G`_5?-{W&&bnAZ7t#Rv>2E
Kp1{Z+rT_p&>=0T2

diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index 5841f7339..a8a7e8898 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -10,10 +10,6 @@
 #include <vector>
 #include <locale>
 
-static std::string vocab_type(llama_context * ctx) {
-    return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
-}
-
 static std::string escape_whitespace(const std::string& text) {
     std::string result;
     bool escaping = false;
@@ -91,8 +87,8 @@ int main(int argc, char **argv) {
                 return 2;
             }
         } else {
-            if ((vocab_type(ctx) == "spm" && i <= 258) ||
-                (vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
+            // TODO: needs access to token types
+            if (0 <= i && i < 259) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
@@ -103,20 +99,28 @@ int main(int argc, char **argv) {
         }
     }
 
-    std::wstring_convert<typename std::codecvt_utf8<wchar_t>, wchar_t> converter;
-    for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
-        std::wstring wstr(1, ch);
-        std::string str;
-        try {
-            str = converter.to_bytes(wstr);
-        } catch (std::exception & e) {
-            continue;
+#ifdef _WIN32
+    std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
+    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::u16string u16str(1, ch);
+        std::string str = u16converter.to_bytes(u16str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n",
+                __func__, str.c_str(), tokens[0]);
         }
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
+    }
+
+    std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
+    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
+        std::u32string u32str(1, ch);
+        std::string str = u32converter.to_bytes(u32str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
         if (tokens.size() == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
         }
     }
+#endif
 
     llama_free_model(model);
     llama_free(ctx);

From 0b53b8b08d951da4b24f80e06a7982c3394d66f4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 21 Aug 2023 19:35:31 +0300
Subject: [PATCH 25/25] llama : add API for token type

ggml-ci
---
 convert.py                                    | 33 +++----
 .../convert-llama2c-to-ggml.cpp               | 44 +++++-----
 .../train-text-from-scratch.cpp               | 33 +++----
 gguf.py                                       | 10 +++
 llama.cpp                                     | 85 +++++++++----------
 llama.h                                       | 26 +++---
 6 files changed, 115 insertions(+), 116 deletions(-)

diff --git a/convert.py b/convert.py
index f680f8596..4ba36f280 100755
--- a/convert.py
+++ b/convert.py
@@ -241,17 +241,19 @@ class BpeVocab:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+
         vocab_size: int = len(self.bpe_tokenizer)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
         if expected_ids != actual_ids:
             raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_list    = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens
 
     def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.bpe_tokenizer
@@ -261,12 +263,12 @@ class BpeVocab:
         for i, item in enumerate(tokenizer):
             text: bytes = item.encode("utf-8")
             score: float = -i
-            yield text, score, 4
+            yield text, score, gguf.TokenType.USER_DEFINED
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.bpe_tokens()
@@ -304,27 +306,27 @@ class SentencePieceVocab:
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
 
-            toktype = 1  # defualt to normal token type
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.is_unknown(i):
-                toktype = 2
+                toktype = gguf.TokenType.UNKNOWN
             if tokenizer.is_control(i):
-                toktype = 3
+                toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = 4
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
             if tokenizer.is_unused(i):
-                toktype = 5
+                toktype = gguf.TokenType.UNUSED
             if tokenizer.is_byte(i):
-                toktype = 6
+                toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
@@ -725,6 +727,7 @@ class OutputFile:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 
     def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_name                ("llama")
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index af493e15b..469d6e3de 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -139,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };
 
 struct my_llama_hparams {
@@ -516,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
         struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab->id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab->id_to_token[i].tok   = tok;
-            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
         }
         llama_free(lctx);
         llama_free_model(lmodel);
     } else { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
-        uint32_t n_vocab = config->vocab_size;
+        const int  n_vocab = config->vocab_size;
         /* uint32_t max_token_length =  */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (uint32_t i=0; i<n_vocab; ++i) {
+        for (int i=0; i<n_vocab; ++i) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
-            std::string tok = file.read_string(len);
-            vocab->id_to_token[i].tok = tok;
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
         }
     }
 }
@@ -611,10 +607,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 //    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
 //    uint32_t n_vocab = model->hparams.n_vocab;
 //    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_score = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_score.tok.size());
-//        file.write_raw(token_score.tok.data(), token_score.tok.size());
-//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
 //    }
 //
 //    // stuff AK weights into GG weights one by one.
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 922518da4..31d6620a2 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -170,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };
 
 struct my_llama_hparams {
@@ -2629,10 +2631,10 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 //    // write_vocab
 //    uint32_t n_vocab = model->hparams.n_vocab;
 //    for (uint32_t i = 0; i < n_vocab; i++) {
-//        const auto & token_score = vocab->id_to_token.at(i);
-//        file.write_u32((uint32_t) token_score.tok.size());
-//        file.write_raw(token_score.tok.data(), token_score.tok.size());
-//        file.write_raw(&token_score.score, sizeof(token_score.score));
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
 //    }
 //    // write tensors
 //    write_tensor(&file, model->tok_embeddings);
@@ -3055,20 +3057,13 @@ int main(int argc, char ** argv) {
 
     struct llama_vocab vocab;
     {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab.id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
         }
     }
 
diff --git a/gguf.py b/gguf.py
index 60ee52f09..9776649c7 100644
--- a/gguf.py
+++ b/gguf.py
@@ -61,6 +61,7 @@ KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
 KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
 KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
 
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -319,6 +320,15 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
 
     return tensor_map
 
+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
 #
 # implementation
 #
diff --git a/llama.cpp b/llama.cpp
index 1785025f0..c97aaee69 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -771,11 +771,12 @@ struct llama_vocab {
 
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
     struct token_data {
-        token tok;
+        token text;
         float score;
-        int toktype;
+        ttype type;
     };
 
     llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -1521,12 +1522,12 @@ static void llama_model_load_internal(
             vocab.token_to_id[word] = i;
 
             auto & token_data = vocab.id_to_token[i];
-            token_data.tok = std::move(word);
+            token_data.text  = std::move(word);
             token_data.score = scores[i];
-            token_data.toktype = toktypes[i];
+            token_data.type  = (llama_token_type) toktypes[i];
 
             // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.tok == "<0x0A>") {
+            if (token_data.text == "<0x0A>") {
                 vocab.linefeed_id = i;
             }
         }
@@ -1558,12 +1559,12 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: general.name = %s\n",    __func__, general_name.c_str());
 
         // special tokens
-        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].tok.c_str() ); }
-        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].tok.c_str() ); }
-        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].tok.c_str() ); }
-        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].tok.c_str() ); }
-        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].tok.c_str() ); }
-        if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].tok.c_str() );    }
+        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+        if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
     }
 
     if (vocab_only) {
@@ -2355,15 +2356,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 1;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 2;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 3;
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
+}
+
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
 static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
@@ -2381,22 +2394,10 @@ static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
     return id == vocab.special_pad_id;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 4;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 5;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].toktype == 6;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-    auto buf = token_data.tok.substr(3, 2);
+    auto buf = token_data.text.substr(3, 2);
     return strtol(buf.c_str(), NULL, 16);
 }
 
@@ -2709,6 +2710,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
 
     bool found            = false;
     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
     do {
@@ -4957,25 +4959,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-int llama_get_vocab(
-        const struct llama_context * ctx,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
+const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].text.c_str();
 }
 
-int llama_model_get_vocab(
-        const struct llama_model * model,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
-    for (int i = 0; i<n; ++i) {
-        strings[i] = model->vocab.id_to_token[i].tok.c_str();
-        scores[i]  = model->vocab.id_to_token[i].score;
-    }
-    return n;
+float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].score;
+}
+
+llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
+    return ctx->model.vocab.id_to_token[token].type;
 }
 
 llama_token llama_token_bos(const struct llama_context * ctx) {
@@ -5046,7 +5039,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char
 
 int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
-        std::string result = ctx->model.vocab.id_to_token[token].tok;
+        std::string result = ctx->model.vocab.id_to_token[token].text;
         if (length < (int) result.length()) {
             return -result.length();
         }
@@ -5060,7 +5053,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
 int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].tok;
+            std::string result = model->vocab.id_to_token[token].text;
             if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
                 result = llama_unescape_whitespace(result);
             }
diff --git a/llama.h b/llama.h
index 0ea65c1b5..aa5b7d69c 100644
--- a/llama.h
+++ b/llama.h
@@ -72,6 +72,16 @@ extern "C" {
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
     };
 
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32              = 0,
@@ -330,19 +340,11 @@ extern "C" {
     // Vocab
     //
 
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
 
-    LLAMA_API int llama_model_get_vocab(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence