From 032a1718678b0e3fdfe24f0443f7a9109608ad1b Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Fri, 28 Apr 2023 12:58:39 +0800
Subject: [PATCH] integrated q5 formats

---
 Makefile                          |   5 +-
 gpttype_adapter.cpp               |  18 ++---
 otherarch/gpt2_v1.cpp             |   6 +-
 otherarch/gpt2_v2.cpp             |  26 +++----
 otherarch/gptj_v1.cpp             |   8 +--
 otherarch/gptj_v2.cpp             |  26 +++----
 otherarch/neox.cpp                |  22 ++-----
 otherarch/otherarch.h             |   4 +-
 otherarch/tools/common-ggml.cpp   | 106 ++++++++++++++++++++++++++----
 otherarch/tools/common-ggml.h     |  32 ++++++---
 otherarch/tools/gpt2_quantize.cpp |  25 +++----
 otherarch/tools/gptj_quantize.cpp |  17 ++---
 otherarch/tools/neox_quantize.cpp |  18 +++--
 13 files changed, 184 insertions(+), 129 deletions(-)

diff --git a/Makefile b/Makefile
index 0380d6b8c..fb851996a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
 simple: koboldcpp koboldcpp_noavx2
+tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
 dev: koboldcpp_openblas
 
@@ -45,8 +46,8 @@ endif
 #
 
 # keep standard at C11 and C++11
-CFLAGS = -I. -I./include -I./include/CL -Ofast -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -Ofast -DNDEBUG -std=c++11 -fPIC
+CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =
 
 # these are used on windows, to build some libraries with extra old device compatibility
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 7bc917ee2..11d815263 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -9,19 +9,21 @@
 #include
 #include "model_adapter.h"
-#include "otherarch/otherarch.h"
+#include "otherarch.h"
 
 //for easier compilation
 #include "llamaextra.cpp"
 
 //concat source files into one file for compilation purposes
-#include "otherarch/utils.cpp"
-#include "otherarch/gptj_v1.cpp"
-#include "otherarch/gptj_v2.cpp"
-#include "otherarch/gpt2_v1.cpp"
-#include "otherarch/gpt2_v2.cpp"
-#include "otherarch/rwkv.cpp"
-#include "otherarch/neox.cpp"
+#include "common-ggml.cpp"
+#include "utils.cpp"
+#include "gptj_v1.cpp"
+#include "gptj_v2.cpp"
+#include "gpt2_v1.cpp"
+#include "gpt2_v2.cpp"
+#include "rwkv.cpp"
+#include "neox.cpp"
+
 
 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;
 
diff --git a/otherarch/gpt2_v1.cpp b/otherarch/gpt2_v1.cpp
index 1bea45b8c..e60084b34 100644
--- a/otherarch/gpt2_v1.cpp
+++ b/otherarch/gpt2_v1.cpp
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         //used to expand KV size if needed
         desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
 
@@ -58,7 +58,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: f16 = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
 
@@ -87,7 +87,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
     // for the big tensors, we have the option to store the data in 16-bit floats
     // in order to save memory and also to speed up the computation
-    const ggml_v1_type wtype = model.hparams.f16 ? GGML_V1_TYPE_F16 : GGML_V1_TYPE_F32;
+    const ggml_v1_type wtype = GGML_V1_TYPE_F16;
 
     auto & ctx = model.ctx;
 
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index db230866f..9b8c0baa5 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -2,6 +2,7 @@
 #include "otherarch.h"
 
 #include "utils.h"
+#include "common-ggml.h"
 
 #include
 #include
@@ -47,14 +48,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: ftype = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -85,24 +86,13 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    ggml_type wtype = GGML_TYPE_COUNT;
-    switch (model.hparams.f16) {
-        case 0: wtype = GGML_TYPE_F32; break;
-        case 1: wtype = GGML_TYPE_F16; break;
-        case 2: wtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = GGML_TYPE_Q4_1; break;
-        case 5: wtype = GGML_TYPE_Q4_2; break;
-        case 6: wtype = GGML_TYPE_Q4_3; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
-                    return ModelLoadResult::FAIL;
-                }
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return ModelLoadResult::FAIL;
     }
-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
diff --git a/otherarch/gptj_v1.cpp b/otherarch/gptj_v1.cpp
index 6d3530c69..2f6ae9898 100644
--- a/otherarch/gptj_v1.cpp
+++ b/otherarch/gptj_v1.cpp
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -56,7 +56,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: f16 = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -86,7 +86,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_v1_type wtype = GGML_V1_TYPE_COUNT;
-    switch (model.hparams.f16) {
+    switch (model.hparams.ftype) {
         case 0: wtype = GGML_V1_TYPE_F32; break;
         case 1: wtype = GGML_V1_TYPE_F16; break;
         case 2: wtype = GGML_V1_TYPE_Q4_0; break;
@@ -94,7 +94,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
         default:
                 {
                     fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
+                            __func__, fname.c_str(), model.hparams.ftype);
                     return ModelLoadResult::FAIL;
                 }
     }
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 0f8bd8815..11c53d141 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -2,6 +2,7 @@
 #include "otherarch.h"
 
 #include "utils.h"
+#include "common-ggml.h"
 
 #include
 #include
@@ -48,7 +49,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -56,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: ftype = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -85,24 +86,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    ggml_type wtype = GGML_TYPE_COUNT;
-    switch (model.hparams.f16) {
-        case 0: wtype = GGML_TYPE_F32; break;
-        case 1: wtype = GGML_TYPE_F16; break;
-        case 2: wtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = GGML_TYPE_Q4_1; break;
-        case 5: wtype = GGML_TYPE_Q4_2; break;
-        case 6: wtype = GGML_TYPE_Q4_3; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
-                    return ModelLoadResult::FAIL;
-                }
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return ModelLoadResult::FAIL;
     }
-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;
 
     auto memory_type = GGML_TYPE_F16;
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 519bc34a7..32e5b1463 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -2,6 +2,7 @@
 #include "otherarch.h"
 
 #include "utils.h"
+#include "common-ggml.h"
 
 #include
 #include
@@ -76,24 +77,13 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    ggml_type wtype = GGML_TYPE_COUNT;
-    switch (model.hparams.ftype) {
-        case 0: wtype = GGML_TYPE_F32; break;
-        case 1: wtype = GGML_TYPE_F16; break;
-        case 2: wtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = GGML_TYPE_Q4_1; break;
-        case 5: wtype = GGML_TYPE_Q4_2; break;
-        case 6: wtype = GGML_TYPE_Q4_3; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                            __func__, fname.c_str(), model.hparams.ftype);
-                    return ModelLoadResult::FAIL;
-                }
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return ModelLoadResult::FAIL;
     }
-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
index b8a1951e5..3713be7ce 100644
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@@ -23,7 +23,7 @@ struct gptj_hparams {
     int32_t n_head = 16;
     int32_t n_layer = 28;
     int32_t n_rot = 64;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 struct gptj_layer {
@@ -120,7 +120,7 @@ struct gpt2_hparams {
     int32_t n_embd = 768;
     int32_t n_head = 12;
     int32_t n_layer = 12;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 struct gpt2_v1_layer {
diff --git a/otherarch/tools/common-ggml.cpp b/otherarch/tools/common-ggml.cpp
index 71406ced7..b46883610 100644
--- a/otherarch/tools/common-ggml.cpp
+++ b/otherarch/tools/common-ggml.cpp
@@ -1,26 +1,86 @@
-#include "otherarch/tools/common-ggml.h"
-
-#include "ggml.h"
+#include "common-ggml.h"
 
 #include
 
+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
+    {"q4_3", GGML_FTYPE_MOSTLY_Q4_3},
+    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+void ggml_print_ftypes(FILE * fp) {
+    for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+        fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+    }
+}
+
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+    enum ggml_ftype ftype;
+    if (str[0] == 'q') {
+        const auto it = GGML_FTYPE_MAP.find(str);
+        if (it == GGML_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+            return GGML_FTYPE_UNKNOWN;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum ggml_ftype) atoi(str);
+    }
+
+    return ftype;
+}
+
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype) {
+    ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
+        case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+        case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
+        case GGML_FTYPE_MOSTLY_Q4_3: wtype = GGML_TYPE_Q4_3; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+    }
+
+    return wtype;
+}
+
 bool ggml_common_quantize_0(
         std::ifstream & finp,
         std::ofstream & fout,
-        const ggml_mtype mtype,
+        const ggml_ftype ftype,
         const std::vector<std::string> & to_quant,
         const std::vector<std::string> & to_skip) {
 
     ggml_type qtype = GGML_TYPE_F32;
 
-    switch (mtype) {
-        case 2: qtype = GGML_TYPE_Q4_0; break;
-        case 3: qtype = GGML_TYPE_Q4_1; break;
-        case 5: qtype = GGML_TYPE_Q4_2; break;
-        case 6: qtype = GGML_TYPE_Q4_3; break;
-        default:
+    switch (ftype) {
+        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
+        case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN:
+        case GGML_FTYPE_ALL_F32:
+        case GGML_FTYPE_MOSTLY_F16:
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
                 {
-                    fprintf(stderr, "%s: invalid model type %d\n", __func__, mtype);
+                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                     return false;
                 }
     };
@@ -127,7 +187,7 @@ bool ggml_common_quantize_0(
                 size_t cur_size = 0;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                switch (ttype) {
+                switch ((ggml_type) ttype) {
                     case GGML_TYPE_Q4_0:
                         {
                             cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
@@ -144,7 +204,25 @@ bool ggml_common_quantize_0(
                         {
                             cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
-                    default:
+                    case GGML_TYPE_Q5_0:
+                        {
+                            cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q5_1:
+                        {
+                            cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q8_0:
+                        {
+                            cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_I8:
+                    case GGML_TYPE_I16:
+                    case GGML_TYPE_I32:
+                    case GGML_TYPE_Q8_1:
+                    case GGML_TYPE_COUNT:
                         {
                             fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                             return false;
@@ -173,7 +251,7 @@ bool ggml_common_quantize_0(
     }
 
     printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    printf("%s: quant size = %8.2f MB | mtype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, mtype, ggml_type_name(qtype));
+    printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
 
     {
         int64_t sum_all = 0;
diff --git a/otherarch/tools/common-ggml.h b/otherarch/tools/common-ggml.h
index 6299cfdb5..af57ea5d1 100644
--- a/otherarch/tools/common-ggml.h
+++ b/otherarch/tools/common-ggml.h
@@ -1,23 +1,37 @@
 #pragma once
 
+#include "ggml.h"
+
+#include
 #include
 #include
 #include
 
 // model file types
-enum ggml_mtype {
-    GGML_MTYPE_ALL_F32 = 0,
-    GGML_MTYPE_MOSTLY_F16 = 1, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-    GGML_MTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+enum ggml_ftype {
+    GGML_FTYPE_UNKNOWN = -1,
+    GGML_FTYPE_ALL_F32 = 0,
+    GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 };
 
+void ggml_print_ftypes(FILE * fp = stderr);
+
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+// TODO: temporary
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype);
+
 bool ggml_common_quantize_0(
         std::ifstream & finp,
         std::ofstream & fout,
-        const ggml_mtype mtype,
+        const ggml_ftype ftype,
         const std::vector<std::string> & to_quant,
         const std::vector<std::string> & to_skip);
\ No newline at end of file
diff --git a/otherarch/tools/gpt2_quantize.cpp b/otherarch/tools/gpt2_quantize.cpp
index 196db0cfd..7e9ad7739 100644
--- a/otherarch/tools/gpt2_quantize.cpp
+++ b/otherarch/tools/gpt2_quantize.cpp
@@ -1,6 +1,4 @@
-#include "ggml.h"
-
-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"
 
 #include
@@ -20,11 +18,11 @@ struct gpt2_hparams {
     int32_t n_embd = 768;
     int32_t n_head = 12;
     int32_t n_layer = 12;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 // quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
     gpt_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -62,21 +60,21 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: f16 = %d\n", __func__, hparams.ftype);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
         fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &mtype, sizeof(hparams.f16));
+        fout.write((char *) &ftype, sizeof(hparams.ftype));
     }
 
     // load vocab
@@ -116,7 +114,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         "model/h.*/mlp/c_proj/w",
     };
 
-    if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
         fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
         return false;
     }
@@ -134,10 +132,7 @@ int main(int argc, char ** argv) {
     ggml_time_init();
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        fprintf(stderr, "  type = 5 - q4_2\n");
-        fprintf(stderr, "  type = 6 - q4_3\n");
+        ggml_print_ftypes(stderr);
         return 1;
     }
 
@@ -151,7 +146,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int mtype = atoi(argv[3]);
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -161,7 +156,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!gpt2_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+        if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/otherarch/tools/gptj_quantize.cpp b/otherarch/tools/gptj_quantize.cpp
index f3ca21d71..a9f4232be 100644
--- a/otherarch/tools/gptj_quantize.cpp
+++ b/otherarch/tools/gptj_quantize.cpp
@@ -1,6 +1,6 @@
 #include "ggml.h"
 
-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"
 
 #include
@@ -25,7 +25,7 @@ struct gptj_hparams {
 };
 
 // quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
     gpt_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -79,7 +79,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fout.write((char *) &mtype, sizeof(hparams.f16));
+        fout.write((char *) &ftype, sizeof(hparams.f16));
     }
 
     // load vocab
@@ -114,7 +114,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         ".*weight",
     };
 
-    if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
         fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
         return false;
     }
@@ -132,10 +132,7 @@ int main(int argc, char ** argv) {
     ggml_time_init();
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        fprintf(stderr, "  type = 5 - q4_2\n");
-        fprintf(stderr, "  type = 6 - q4_3\n");
+        ggml_print_ftypes(stderr);
        return 1;
     }
 
@@ -149,7 +146,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int mtype = atoi(argv[3]);
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -159,7 +156,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!gptj_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+        if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/otherarch/tools/neox_quantize.cpp b/otherarch/tools/neox_quantize.cpp
index d32282eac..b719a5dc5 100644
--- a/otherarch/tools/neox_quantize.cpp
+++ b/otherarch/tools/neox_quantize.cpp
@@ -1,6 +1,6 @@
 #include "ggml.h"
 
-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"
 
 #include
@@ -25,7 +25,7 @@ struct stablelm_hparams {
 };
 
 // quantize a model
-bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
     gpt_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -79,7 +79,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
         fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fout.write((char *) &mtype, sizeof(hparams.ftype));
+        fout.write((char *) &ftype, sizeof(hparams.ftype));
     }
 
     // load vocab
@@ -106,7 +106,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
         ".*weight",
     };
 
-    if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
         fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
         return false;
     }
@@ -121,12 +121,10 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 //  ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
+    ggml_time_init();
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        fprintf(stderr, "  type = 5 - q4_2\n");
-        fprintf(stderr, "  type = 6 - q4_3\n");
+        ggml_print_ftypes(stderr);
         return 1;
     }
 
@@ -140,7 +138,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int mtype = atoi(argv[3]);
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -150,7 +148,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!stablelm_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+        if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
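
Usage note (not part of the patch): the quantization tools touched above are built by the new "tools" Makefile target, and each one now accepts either a format name or its numeric ftype as the third argument. Below is a minimal sketch of how the helpers added in otherarch/tools/common-ggml.h fit together; the file name example_ftype.cpp and the "q5_0" argument are illustrative only, and the snippet assumes common-ggml.h and ggml.h are reachable via the revised CFLAGS/CXXFLAGS include paths.

    // example_ftype.cpp (hypothetical): map a user-supplied type string to a ggml_type,
    // mirroring what the *_quantize.cpp main() functions do after this patch.
    #include "common-ggml.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        const char * arg = (argc > 1) ? argv[1] : "q5_0";

        // accepts a name such as "q5_0", or a raw integer such as "8"
        const enum ggml_ftype ftype = ggml_parse_ftype(arg);
        if (ftype == GGML_FTYPE_UNKNOWN) {
            ggml_print_ftypes(stderr); // prints the supported type names and values
            return 1;
        }

        // map the file-level ftype to the per-tensor quantization type
        const enum ggml_type wtype = ggml_ftype_to_ggml_type(ftype);
        if (wtype == GGML_TYPE_COUNT) {
            return 1; // e.g. GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 has no single tensor type
        }

        printf("ftype %d -> %s\n", ftype, ggml_type_name(wtype));
        return 0;
    }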