From 90fe9096b4ec1eeeed09bb84ac552d76d3d1549f Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 17 May 2023 11:23:29 +0800 Subject: [PATCH] clean and refactoring pass before supporting newer models for different arch --- gpttype_adapter.cpp | 11 ++- otherarch/gpt2_v2.cpp | 1 - otherarch/gptj_v2.cpp | 1 - otherarch/neox.cpp | 13 ++- otherarch/otherarch.h | 10 +-- otherarch/tools/common-ggml.cpp | 36 +++----- otherarch/tools/common-ggml.h | 4 +- otherarch/tools/gpt2_quantize.cpp | 24 +++-- otherarch/tools/gptj_quantize.cpp | 24 +++-- otherarch/tools/gptj_v1_main.cpp | 145 ------------------------------ otherarch/tools/gptj_v2_main.cpp | 145 ------------------------------ otherarch/tools/neox_quantize.cpp | 34 ++++--- 12 files changed, 81 insertions(+), 367 deletions(-) delete mode 100644 otherarch/tools/gptj_v1_main.cpp delete mode 100644 otherarch/tools/gptj_v2_main.cpp diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index f78f3aa99..dae43a685 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -15,7 +15,6 @@ #include "llamaextra.cpp" //concat source files into one file for compilation purposes -#include "common-ggml.cpp" #include "utils.cpp" #include "gptj_v1.cpp" #include "gptj_v2.cpp" @@ -33,7 +32,7 @@ static gptj_model_v1 gptj_ctx_v1; static gptj_model gptj_ctx_v2; static gpt2_v1_model gpt2_ctx_v1; static gpt2_model gpt2_ctx_v2; -static stablelm_model neox_ctx; +static gpt_neox_model neox_ctx; static rwkv_context * rwkv_ctx_v1; static llama_context_params llama_ctx_params; static llama_context * llama_ctx_v1; @@ -378,7 +377,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { - ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format); + ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx, vocab, file_format); if(res==ModelLoadResult::FAIL) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); @@ -394,13 +393,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5); // determine the required inference memory per token: - stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); + gpt_neox_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0])) { //run the black magic eval to determine if it's redpajama. VERY UGLY HACK! 
std::vector test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7"); - stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5)); + gpt_neox_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5)); int topid = std::max_element(logits.begin(),logits.end())-logits.begin(); std::string predicted = vocab.id_to_token[topid].c_str(); if(predicted.find("8") != std::string::npos) @@ -695,7 +694,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { - evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format); + evalres = gpt_neox_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format); } else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2) { diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index 4da8b1921..96f4bf215 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -2,7 +2,6 @@ #include "otherarch.h" #include "utils.h" -#include "common-ggml.h" #include #include diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index 0fc1e16c0..ccfd85897 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -2,7 +2,6 @@ #include "otherarch.h" #include "utils.h" -#include "common-ggml.h" #include #include diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp index c37a24e43..5b21a6e5e 100644 --- a/otherarch/neox.cpp +++ b/otherarch/neox.cpp @@ -2,7 +2,6 @@ #include "otherarch.h" #include "utils.h" -#include "common-ggml.h" #include #include @@ -17,7 +16,7 @@ // load the model's weights from a file -ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab, FileFormat file_format) { +ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -340,8 +339,8 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // -bool stablelm_eval( - const stablelm_model & model, +bool gpt_neox_eval( + const gpt_neox_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, @@ -497,7 +496,7 @@ bool stablelm_eval( } } - if(file_format==FileFormat::NEOX_3) + if(file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5) { // layer input + Attn cur = ggml_add(ctx0, cur, inpL); @@ -511,7 +510,7 @@ bool stablelm_eval( // post attention layer norm // note here we pass inpL instead of cur { - cur = ggml_norm(ctx0, (file_format==FileFormat::NEOX_3?cur:inpL)); + cur = ggml_norm(ctx0, ((file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5)?cur:inpL)); cur = ggml_add(ctx0, ggml_mul(ctx0, @@ -542,7 +541,7 @@ bool stablelm_eval( cur); } - if (file_format == FileFormat::NEOX_3) + if (file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5) { // layer input + FF inpL = ggml_add(ctx0, cur, inpFF); diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index 
f194895e3..ed5927643 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -214,7 +214,7 @@ struct gpt2_model { }; // default hparams (StableLM 3B) -struct stablelm_hparams { +struct gpt_neox_hparams { int32_t n_vocab = 50257; int32_t n_ctx = 4096; int32_t n_embd = 4096; @@ -224,7 +224,7 @@ struct stablelm_hparams { int32_t ftype = 1; }; -struct stablelm_layer { +struct gpt_neox_layer { // pre normalization struct ggml_tensor * ln_1_g; struct ggml_tensor * ln_1_b; @@ -248,8 +248,8 @@ struct stablelm_layer { struct ggml_tensor * c_mlp_proj_b; }; -struct stablelm_model { - stablelm_hparams hparams; +struct gpt_neox_model { + gpt_neox_hparams hparams; // normalization struct ggml_tensor * ln_f_g; @@ -260,7 +260,7 @@ struct stablelm_model { struct ggml_tensor * lmh_g; // language model head //struct ggml_tensor * lmh_b; // language model bias - std::vector layers; + std::vector layers; // key + value memory struct ggml_tensor * memory_k; diff --git a/otherarch/tools/common-ggml.cpp b/otherarch/tools/common-ggml.cpp index 3c0bfe286..9843303fc 100644 --- a/otherarch/tools/common-ggml.cpp +++ b/otherarch/tools/common-ggml.cpp @@ -1,12 +1,11 @@ #include "common-ggml.h" #include +#include static const std::map GGML_FTYPE_MAP = { {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, - {"q4_2", GGML_FTYPE_MOSTLY_Q4_2}, - {"q4_3", GGML_FTYPE_MOSTLY_Q4_3}, {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, @@ -46,8 +45,6 @@ bool ggml_common_quantize_0( switch (ftype) { case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break; - case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break; case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; @@ -91,7 +88,7 @@ bool ggml_common_quantize_0( } int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; + int32_t ne[4] = { 1, 1, 1, 1 }; for (int i = 0; i < n_dims; ++i) { finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; @@ -100,7 +97,7 @@ bool ggml_common_quantize_0( std::string name(length, 0); finp.read (&name[0], length); - printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype)); + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); bool quantize = false; @@ -166,31 +163,23 @@ bool ggml_common_quantize_0( switch ((ggml_type) ttype) { case GGML_TYPE_Q4_0: { - cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q4_1: { - cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_2: - { - cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_3: - { - cur_size = ggml_quantize_q4_3_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q5_0: { - cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = 
ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q5_1: { - cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q8_0: { - cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_F32: case GGML_TYPE_F16: @@ -198,7 +187,6 @@ bool ggml_common_quantize_0( case GGML_TYPE_I16: case GGML_TYPE_I32: case GGML_TYPE_Q8_1: - case GGML_TYPE_Q8_1B: case GGML_TYPE_COUNT: { fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); @@ -210,11 +198,11 @@ bool ggml_common_quantize_0( total_size_new += cur_size; printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < hist_cur.size(); ++i) { + for (int i = 0; i < (int) hist_cur.size(); ++i) { hist_all[i] += hist_cur[i]; } - for (int i = 0; i < hist_cur.size(); ++i) { + for (int i = 0; i < (int) hist_cur.size(); ++i) { printf("%5.3f ", hist_cur[i] / (float)nelements); } printf("\n"); @@ -232,12 +220,12 @@ bool ggml_common_quantize_0( { int64_t sum_all = 0; - for (int i = 0; i < hist_all.size(); ++i) { + for (int i = 0; i < (int) hist_all.size(); ++i) { sum_all += hist_all[i]; } printf("%s: hist: ", __func__); - for (int i = 0; i < hist_all.size(); ++i) { + for (int i = 0; i < (int) hist_all.size(); ++i) { printf("%5.3f ", hist_all[i] / (float)sum_all); } printf("\n"); diff --git a/otherarch/tools/common-ggml.h b/otherarch/tools/common-ggml.h index 9b2e35ad1..29ba4ad5f 100644 --- a/otherarch/tools/common-ggml.h +++ b/otherarch/tools/common-ggml.h @@ -2,16 +2,14 @@ #include "ggml.h" -#include #include #include #include +enum ggml_ftype ggml_parse_ftype(const char * str); void ggml_print_ftypes(FILE * fp = stderr); -enum ggml_ftype ggml_parse_ftype(const char * str); - bool ggml_common_quantize_0( std::ifstream & finp, std::ofstream & fout, diff --git a/otherarch/tools/gpt2_quantize.cpp b/otherarch/tools/gpt2_quantize.cpp index 7e9ad7739..a49de5899 100644 --- a/otherarch/tools/gpt2_quantize.cpp +++ b/otherarch/tools/gpt2_quantize.cpp @@ -18,7 +18,7 @@ struct gpt2_hparams { int32_t n_embd = 768; int32_t n_head = 12; int32_t n_layer = 12; - int32_t ftype = 1; + int32_t ftype = 1; }; // quantize a model @@ -60,21 +60,27 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.ftype); + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + 
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &ftype, sizeof(hparams.ftype)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); } // load vocab diff --git a/otherarch/tools/gptj_quantize.cpp b/otherarch/tools/gptj_quantize.cpp index a9f4232be..5e1c695aa 100644 --- a/otherarch/tools/gptj_quantize.cpp +++ b/otherarch/tools/gptj_quantize.cpp @@ -21,7 +21,7 @@ struct gptj_hparams { int32_t n_head = 16; int32_t n_layer = 28; int32_t n_rot = 64; - int32_t f16 = 1; + int32_t ftype = 1; }; // quantize a model @@ -64,14 +64,20 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.f16, sizeof(hparams.f16)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.f16); + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); @@ -79,7 +85,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &ftype, sizeof(hparams.f16)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); } // load vocab diff --git a/otherarch/tools/gptj_v1_main.cpp b/otherarch/tools/gptj_v1_main.cpp deleted file mode 100644 index 6cc152cb7..000000000 --- a/otherarch/tools/gptj_v1_main.cpp +++ /dev/null @@ -1,145 +0,0 @@ -#include "gptj_v1.cpp" - -int main(int argc, char ** argv) { - ggml_v1_time_init(); - const int64_t t_main_start_us = ggml_v1_time_us(); 
- - gpt_params params; - params.model = "models/gpt-j-6B/ggml-model.bin"; - - if (utils_gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - if( !isatty(STDIN_FILENO) ){ - std::string line; - while( std::getline(std::cin, line) ){ - params.prompt = params.prompt + "\n" + line; - } - } else { - params.prompt = utils_gpt_random_prompt(rng); - } - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gptj_model_v1 model; - FileFormat file_format = FileFormat::GPTJ_2; - - // load the model - { - const int64_t t_start_us = ggml_v1_time_us(); - - if (legacy_gptj_model_load(params.model, model, vocab, file_format)!=ModelLoadResult::SUCCESS) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_v1_time_us() - t_start_us; - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - legacy_gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); - - for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_v1_time_us(); - - if (!legacy_gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token,file_format)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_v1_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_v1_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_v1_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (int k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_v1_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_v1_free(model.ctx); - - return 0; -} diff --git 
a/otherarch/tools/gptj_v2_main.cpp b/otherarch/tools/gptj_v2_main.cpp deleted file mode 100644 index 87346476b..000000000 --- a/otherarch/tools/gptj_v2_main.cpp +++ /dev/null @@ -1,145 +0,0 @@ -#include "gptj_v2.cpp" - - -int main(int argc, char ** argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-j-6B/ggml-model.bin"; - - if (utils_gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - if( !isatty(STDIN_FILENO) ){ - std::string line; - while( std::getline(std::cin, line) ){ - params.prompt = params.prompt + "\n" + line; - } - } else { - params.prompt = utils_gpt_random_prompt(rng); - } - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gptj_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (gptj_model_load(params.model, model, vocab)==ModelLoadResult::FAIL) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (int k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, 
t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} \ No newline at end of file diff --git a/otherarch/tools/neox_quantize.cpp b/otherarch/tools/neox_quantize.cpp index b719a5dc5..602c9d3c4 100644 --- a/otherarch/tools/neox_quantize.cpp +++ b/otherarch/tools/neox_quantize.cpp @@ -14,18 +14,19 @@ #include // default hparams (StableLM 3B) -struct stablelm_hparams { +struct gpt_neox_hparams { int32_t n_vocab = 50257; int32_t n_ctx = 4096; int32_t n_embd = 4096; int32_t n_head = 32; int32_t n_layer = 16; int32_t n_rot = 32; // 0.25 * (n_embd / n_head) + int32_t par_res = 1; // 1 = true, 0 = false int32_t ftype = 1; }; // quantize a model -bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { +bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { gpt_vocab vocab; printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); @@ -54,7 +55,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fout.write((char *) &magic, sizeof(magic)); } - stablelm_hparams hparams; + gpt_neox_hparams hparams; // load hparams { @@ -64,14 +65,22 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + finp.read((char *) &hparams.par_res, sizeof(hparams.par_res)); finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); @@ -79,7 +88,8 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &ftype, sizeof(hparams.ftype)); + fout.write((char *) &hparams.par_res, sizeof(hparams.par_res)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); } // load vocab @@ -118,7 +128,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & } // usage: -// ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin 
models/stablelm2-117M/ggml-model-quant.bin type
+//  ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
     ggml_time_init();
@@ -148,7 +158,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+        if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
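
Note on the quantizer changes above: gpt2_quantize, gptj_quantize and neox_quantize now pack the quantization format version into the stored ftype field as GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype, and recover the source version with an integer division. Below is a minimal standalone sketch of that packing; QNT_VERSION and QNT_VERSION_FACTOR are stand-in values for illustration, not the real GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR macros from ggml.h.

// Standalone sketch of the ftype / quantization-version packing used by the
// quantizers in this patch. The two constants are stand-ins, not the real
// GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR macros.
#include <cstdint>
#include <cstdio>

static const int32_t QNT_VERSION        = 2;    // assumed current format version
static const int32_t QNT_VERSION_FACTOR = 1000; // assumed packing factor

int main() {
    const int32_t ftype = 2; // e.g. a q4_0 file type

    // what the quantize tool writes into the hparams ftype field
    const int32_t ftype_dst = QNT_VERSION * QNT_VERSION_FACTOR + ftype;

    // what a loader recovers from a stored value
    const int32_t qntvr     = ftype_dst / QNT_VERSION_FACTOR;
    const int32_t ftype_raw = ftype_dst % QNT_VERSION_FACTOR;

    printf("stored = %d -> qntvr = %d, ftype = %d\n", ftype_dst, qntvr, ftype_raw);
    return 0;
}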
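
Note on the gpt_neox_eval changes above: the NEOX_3/NEOX_5 branches switch a layer from the default parallel residual to a sequential residual (the RedPajama-style layout that the "1 2 3 4 5 6 7" probe in gpttype_adapter.cpp tries to detect). A rough sketch of the difference follows; norm/attn/mlp are trivial stand-ins, not the real ggml graph ops.

// Rough sketch of the two residual layouts handled in gpt_neox_eval.
#include <cstdio>

static float norm(float x) { return x; }         // stand-in for layernorm
static float attn(float x) { return 0.5f * x; }  // stand-in for self-attention
static float mlp (float x) { return x + 1.0f; }  // stand-in for the feed-forward block

// parallel residual (default branch):
// out = x + attn(norm(x)) + mlp(norm(x))  -- the MLP sees the layer input
static float layer_parallel(float x) {
    return x + attn(norm(x)) + mlp(norm(x));
}

// sequential residual (NEOX_3 / NEOX_5 branch):
// h = x + attn(norm(x)); out = h + mlp(norm(h))  -- the MLP sees input + attention
static float layer_sequential(float x) {
    const float h = x + attn(norm(x));
    return h + mlp(norm(h));
}

int main() {
    const float x = 1.0f;
    printf("parallel: %.2f, sequential: %.2f\n", layer_parallel(x), layer_sequential(x));
    return 0;
}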