integrated q5 formats
parent e8a389f85b
commit 032a171867
13 changed files with 184 additions and 129 deletions
Makefile (5 changes)

@@ -1,5 +1,6 @@
default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
simple: koboldcpp koboldcpp_noavx2
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
dev: koboldcpp_openblas

@@ -45,8 +46,8 @@ endif
 #
 # keep standard at C11 and C++11
-CFLAGS = -I. -I./include -I./include/CL -Ofast -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -Ofast -DNDEBUG -std=c++11 -fPIC
+CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =

 # these are used on windows, to build some libraries with extra old device compatibility
@@ -9,19 +9,21 @@
 #include <time.h>
 #include "model_adapter.h"
-#include "otherarch/otherarch.h"
+#include "otherarch.h"

 //for easier compilation
 #include "llamaextra.cpp"

 //concat source files into one file for compilation purposes
-#include "otherarch/utils.cpp"
-#include "otherarch/gptj_v1.cpp"
-#include "otherarch/gptj_v2.cpp"
-#include "otherarch/gpt2_v1.cpp"
-#include "otherarch/gpt2_v2.cpp"
-#include "otherarch/rwkv.cpp"
-#include "otherarch/neox.cpp"
+#include "common-ggml.cpp"
+#include "utils.cpp"
+#include "gptj_v1.cpp"
+#include "gptj_v2.cpp"
+#include "gpt2_v1.cpp"
+#include "gpt2_v2.cpp"
+#include "rwkv.cpp"
+#include "neox.cpp"

 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 //used to expand KV size if needed
 desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);

@@ -58,7 +58,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: f16 = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -87,7 +87,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 // for the big tensors, we have the option to store the data in 16-bit floats
 // in order to save memory and also to speed up the computation
-const ggml_v1_type wtype = model.hparams.f16 ? GGML_V1_TYPE_F16 : GGML_V1_TYPE_F32;
+const ggml_v1_type wtype = GGML_V1_TYPE_F16;

 auto & ctx = model.ctx;
@@ -2,6 +2,7 @@
 #include "otherarch.h"

 #include "utils.h"
+#include "common-ggml.h"

 #include <cassert>
 #include <cmath>

@@ -47,14 +48,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: ftype = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -85,24 +86,13 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
-ggml_type wtype = GGML_TYPE_COUNT;
-switch (model.hparams.f16) {
-case 0: wtype = GGML_TYPE_F32; break;
-case 1: wtype = GGML_TYPE_F16; break;
-case 2: wtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = GGML_TYPE_Q4_1; break;
-case 5: wtype = GGML_TYPE_Q4_2; break;
-case 6: wtype = GGML_TYPE_Q4_3; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
-return ModelLoadResult::FAIL;
-}
+ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+__func__, fname.c_str(), model.hparams.ftype);
+return ModelLoadResult::FAIL;
+}

 const ggml_type wtype2 = GGML_TYPE_F32;

 auto & ctx = model.ctx;

 size_t ctx_size = 0;
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -56,7 +56,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: f16 = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -86,7 +86,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
 ggml_v1_type wtype = GGML_V1_TYPE_COUNT;
-switch (model.hparams.f16) {
+switch (model.hparams.ftype) {
 case 0: wtype = GGML_V1_TYPE_F32; break;
 case 1: wtype = GGML_V1_TYPE_F16; break;
 case 2: wtype = GGML_V1_TYPE_Q4_0; break;

@@ -94,7 +94,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 default:
 {
 fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
+__func__, fname.c_str(), model.hparams.ftype);
 return ModelLoadResult::FAIL;
 }
 }
@@ -2,6 +2,7 @@
 #include "otherarch.h"

 #include "utils.h"
+#include "common-ggml.h"

 #include <cassert>
 #include <cmath>

@@ -48,7 +49,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -56,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: ftype = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -85,24 +86,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
-ggml_type wtype = GGML_TYPE_COUNT;
-switch (model.hparams.f16) {
-case 0: wtype = GGML_TYPE_F32; break;
-case 1: wtype = GGML_TYPE_F16; break;
-case 2: wtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = GGML_TYPE_Q4_1; break;
-case 5: wtype = GGML_TYPE_Q4_2; break;
-case 6: wtype = GGML_TYPE_Q4_3; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
-return ModelLoadResult::FAIL;
-}
+ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+__func__, fname.c_str(), model.hparams.ftype);
+return ModelLoadResult::FAIL;
+}

 const ggml_type wtype2 = GGML_TYPE_F32;

 auto & ctx = model.ctx;

 auto memory_type = GGML_TYPE_F16;
@@ -2,6 +2,7 @@
 #include "otherarch.h"

 #include "utils.h"
+#include "common-ggml.h"

 #include <cassert>
 #include <cmath>

@@ -76,24 +77,13 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
-ggml_type wtype = GGML_TYPE_COUNT;
-switch (model.hparams.ftype) {
-case 0: wtype = GGML_TYPE_F32; break;
-case 1: wtype = GGML_TYPE_F16; break;
-case 2: wtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = GGML_TYPE_Q4_1; break;
-case 5: wtype = GGML_TYPE_Q4_2; break;
-case 6: wtype = GGML_TYPE_Q4_3; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-__func__, fname.c_str(), model.hparams.ftype);
-return ModelLoadResult::FAIL;
-}
+ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+__func__, fname.c_str(), model.hparams.ftype);
+return ModelLoadResult::FAIL;
+}

 const ggml_type wtype2 = GGML_TYPE_F32;

 auto & ctx = model.ctx;

 size_t ctx_size = 0;
@@ -23,7 +23,7 @@ struct gptj_hparams {
 int32_t n_head = 16;
 int32_t n_layer = 28;
 int32_t n_rot = 64;
-int32_t f16 = 1;
+int32_t ftype = 1;
 };

 struct gptj_layer {

@@ -120,7 +120,7 @@ struct gpt2_hparams {
 int32_t n_embd = 768;
 int32_t n_head = 12;
 int32_t n_layer = 12;
-int32_t f16 = 1;
+int32_t ftype = 1;
 };

 struct gpt2_v1_layer {
@@ -1,26 +1,86 @@
-#include "otherarch/tools/common-ggml.h"
+#include "ggml.h"
+#include "common-ggml.h"

 #include <regex>

+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
+{"q4_3", GGML_FTYPE_MOSTLY_Q4_3},
+{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+void ggml_print_ftypes(FILE * fp) {
+for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
+}
+}
+
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+enum ggml_ftype ftype;
+if (str[0] == 'q') {
+const auto it = GGML_FTYPE_MAP.find(str);
+if (it == GGML_FTYPE_MAP.end()) {
+fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+return GGML_FTYPE_UNKNOWN;
+}
+ftype = it->second;
+} else {
+ftype = (enum ggml_ftype) atoi(str);
+}
+
+return ftype;
+}
+
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype) {
+ggml_type wtype = GGML_TYPE_COUNT;
+
+switch (ftype) {
+case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
+case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
+case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
+case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
+case GGML_FTYPE_MOSTLY_Q4_3: wtype = GGML_TYPE_Q4_3; break;
+case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
+case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
+case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
+case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+}
+
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+}
+
+return wtype;
+}
+
 bool ggml_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_mtype mtype,
+const ggml_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip) {

 ggml_type qtype = GGML_TYPE_F32;

-switch (mtype) {
-case 2: qtype = GGML_TYPE_Q4_0; break;
-case 3: qtype = GGML_TYPE_Q4_1; break;
-case 5: qtype = GGML_TYPE_Q4_2; break;
-case 6: qtype = GGML_TYPE_Q4_3; break;
-default:
+switch (ftype) {
+case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
+case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break;
+case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+case GGML_FTYPE_UNKNOWN:
+case GGML_FTYPE_ALL_F32:
+case GGML_FTYPE_MOSTLY_F16:
+case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
 {
-fprintf(stderr, "%s: invalid model type %d\n", __func__, mtype);
+fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
 return false;
 }
 };

@@ -127,7 +187,7 @@ bool ggml_common_quantize_0(
 size_t cur_size = 0;
 std::vector<int64_t> hist_cur(1 << 4, 0);

-switch (ttype) {
+switch ((ggml_type) ttype) {
 case GGML_TYPE_Q4_0:
 {
 cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());

@@ -144,7 +204,25 @@ bool ggml_common_quantize_0(
 {
 cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-default:
+case GGML_TYPE_Q5_0:
+{
+cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+} break;
+case GGML_TYPE_Q5_1:
+{
+cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+} break;
+case GGML_TYPE_Q8_0:
+{
+cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+} break;
+case GGML_TYPE_F32:
+case GGML_TYPE_F16:
+case GGML_TYPE_I8:
+case GGML_TYPE_I16:
+case GGML_TYPE_I32:
+case GGML_TYPE_Q8_1:
+case GGML_TYPE_COUNT:
 {
 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
 return false;

@@ -173,7 +251,7 @@ bool ggml_common_quantize_0(
 }

 printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-printf("%s: quant size = %8.2f MB | mtype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, mtype, ggml_type_name(qtype));
+printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

 {
 int64_t sum_all = 0;
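Every model loader in this commit converges on the same pattern: the per-model switch over hparams.f16 is replaced by one call to the shared helper above. A minimal sketch of that pattern, assuming only the common-ggml.h declarations from this commit (pick_weight_type is an illustrative name, not part of the change):

// Sketch only: how a loader maps the ftype stored in the model file to a
// ggml tensor type via the shared helper instead of a local switch.
#include <cstdio>
#include "common-ggml.h"

static ggml_type pick_weight_type(int stored_ftype) {
    // ggml_ftype_to_ggml_type returns GGML_TYPE_COUNT for anything it cannot
    // map; the loaders treat that as a bad model file and abort the load.
    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) stored_ftype);
    if (wtype == GGML_TYPE_COUNT) {
        fprintf(stderr, "invalid ftype value %d\n", stored_ftype);
    }
    return wtype;
}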
@@ -1,23 +1,37 @@
 #pragma once

 #include "ggml.h"

 #include <map>
 #include <fstream>
 #include <vector>
 #include <string>

 // model file types
-enum ggml_mtype {
-GGML_MTYPE_ALL_F32 = 0,
-GGML_MTYPE_MOSTLY_F16 = 1, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-GGML_MTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+enum ggml_ftype {
+GGML_FTYPE_UNKNOWN = -1,
+GGML_FTYPE_ALL_F32 = 0,
+GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 };

+void ggml_print_ftypes(FILE * fp = stderr);
+
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+// TODO: temporary
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype);
+
 bool ggml_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_mtype mtype,
+const ggml_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip);
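The three quantize tools that follow all drop their hard-coded "type = 2..6" help text and atoi() argument parsing in favour of the helpers declared above. A minimal sketch of the shared argument handling, assuming the common-ggml.h API from this commit (the surrounding main() is illustrative and trimmed to the argument-parsing step):

// Sketch only: quantize-tool argument handling after this change. Accepts a
// named format such as "q5_0" or a raw numeric ftype value.
#include <cstdio>
#include "common-ggml.h"

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr); // lists q4_0 .. q8_0, including the new q5_0/q5_1
        return 1;
    }
    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
    if (ftype == GGML_FTYPE_UNKNOWN) {
        return 1; // ggml_parse_ftype already reported the unknown name
    }
    // ...open argv[1] and quantize into argv[2] via the per-model *_model_quantize(..., ftype)
    return 0;
}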
@@ -1,6 +1,4 @@
 #include "ggml.h"

-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -20,11 +18,11 @@ struct gpt2_hparams {
 int32_t n_embd = 768;
 int32_t n_head = 12;
 int32_t n_layer = 12;
-int32_t f16 = 1;
+int32_t ftype = 1;
 };

 // quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -62,21 +60,21 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: f16 = %d\n", __func__, hparams.ftype);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
 fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
 fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fout.write((char *) &mtype, sizeof(hparams.f16));
+fout.write((char *) &ftype, sizeof(hparams.ftype));
 }

 // load vocab

@@ -116,7 +114,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 "model/h.*/mlp/c_proj/w",
 };

-if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -134,10 +132,7 @@ int main(int argc, char ** argv) {
 ggml_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-fprintf(stderr, " type = 2 - q4_0\n");
-fprintf(stderr, " type = 3 - q4_1\n");
-fprintf(stderr, " type = 5 - q4_2\n");
-fprintf(stderr, " type = 6 - q4_3\n");
+ggml_print_ftypes(stderr);
 return 1;
 }

@@ -151,7 +146,7 @@ int main(int argc, char ** argv) {
 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const int mtype = atoi(argv[3]);
+const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

 const int64_t t_main_start_us = ggml_time_us();

@@ -161,7 +156,7 @@ int main(int argc, char ** argv) {
 {
 const int64_t t_start_us = ggml_time_us();

-if (!gpt2_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }
@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -25,7 +25,7 @@ struct gptj_hparams {
 };

 // quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -79,7 +79,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
 fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fout.write((char *) &mtype, sizeof(hparams.f16));
+fout.write((char *) &ftype, sizeof(hparams.f16));
 }

 // load vocab

@@ -114,7 +114,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -132,10 +132,7 @@ int main(int argc, char ** argv) {
 ggml_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-fprintf(stderr, " type = 2 - q4_0\n");
-fprintf(stderr, " type = 3 - q4_1\n");
-fprintf(stderr, " type = 5 - q4_2\n");
-fprintf(stderr, " type = 6 - q4_3\n");
+ggml_print_ftypes(stderr);
 return 1;
 }

@@ -149,7 +146,7 @@ int main(int argc, char ** argv) {
 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const int mtype = atoi(argv[3]);
+const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

 const int64_t t_main_start_us = ggml_time_us();

@@ -159,7 +156,7 @@ int main(int argc, char ** argv) {
 {
 const int64_t t_start_us = ggml_time_us();

-if (!gptj_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }
@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -25,7 +25,7 @@ struct stablelm_hparams {
 };

 // quantize a model
-bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -79,7 +79,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
 fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fout.write((char *) &mtype, sizeof(hparams.ftype));
+fout.write((char *) &ftype, sizeof(hparams.ftype));
 }

 // load vocab

@@ -106,7 +106,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -121,12 +121,10 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 // ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
 ggml_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-fprintf(stderr, " type = 2 - q4_0\n");
-fprintf(stderr, " type = 3 - q4_1\n");
-fprintf(stderr, " type = 5 - q4_2\n");
-fprintf(stderr, " type = 6 - q4_3\n");
+ggml_print_ftypes(stderr);
 return 1;
 }

@@ -140,7 +138,7 @@ int main(int argc, char ** argv) {
 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const int mtype = atoi(argv[3]);
+const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

 const int64_t t_main_start_us = ggml_time_us();

@@ -150,7 +148,7 @@ int main(int argc, char ** argv) {
 {
 const int64_t t_start_us = ggml_time_us();

-if (!stablelm_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }