From 032a1718678b0e3fdfe24f0443f7a9109608ad1b Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Fri, 28 Apr 2023 12:58:39 +0800
Subject: [PATCH] integrated q5 formats

---
 Makefile                          |   5 +-
 gpttype_adapter.cpp               |  18 ++---
 otherarch/gpt2_v1.cpp             |   6 +-
 otherarch/gpt2_v2.cpp             |  26 +++----
 otherarch/gptj_v1.cpp             |   8 +--
 otherarch/gptj_v2.cpp             |  26 +++----
 otherarch/neox.cpp                |  22 ++-----
 otherarch/otherarch.h             |   4 +-
 otherarch/tools/common-ggml.cpp   | 106 ++++++++++++++++++++++++++----
 otherarch/tools/common-ggml.h     |  32 ++++++---
 otherarch/tools/gpt2_quantize.cpp |  25 +++----
 otherarch/tools/gptj_quantize.cpp |  17 ++---
 otherarch/tools/neox_quantize.cpp |  18 +++--
 13 files changed, 184 insertions(+), 129 deletions(-)

diff --git a/Makefile b/Makefile
index 0380d6b8c..fb851996a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
 simple: koboldcpp koboldcpp_noavx2
+tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
 dev: koboldcpp_openblas
 
@@ -45,8 +46,8 @@ endif
 #
 
 # keep standard at C11 and C++11
-CFLAGS = -I. -I./include -I./include/CL -Ofast -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -Ofast -DNDEBUG -std=c++11 -fPIC
+CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =
 
 # these are used on windows, to build some libraries with extra old device compatibility
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 7bc917ee2..11d815263 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -9,19 +9,21 @@
 #include
 #include "model_adapter.h"
-#include "otherarch/otherarch.h"
+#include "otherarch.h"
 
 //for easier compilation
 #include "llamaextra.cpp"
 
 //concat source files into one file for compilation purposes
-#include "otherarch/utils.cpp"
-#include "otherarch/gptj_v1.cpp"
-#include "otherarch/gptj_v2.cpp"
-#include "otherarch/gpt2_v1.cpp"
-#include "otherarch/gpt2_v2.cpp"
-#include "otherarch/rwkv.cpp"
-#include "otherarch/neox.cpp"
+#include "common-ggml.cpp"
+#include "utils.cpp"
+#include "gptj_v1.cpp"
+#include "gptj_v2.cpp"
+#include "gpt2_v1.cpp"
+#include "gpt2_v2.cpp"
+#include "rwkv.cpp"
+#include "neox.cpp"
+
 
 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;
 
diff --git a/otherarch/gpt2_v1.cpp b/otherarch/gpt2_v1.cpp
index 1bea45b8c..e60084b34 100644
--- a/otherarch/gpt2_v1.cpp
+++ b/otherarch/gpt2_v1.cpp
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         //used to expand KV size if needed
         desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
 
@@ -58,7 +58,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: f16 = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
 
@@ -87,7 +87,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
     // for the big tensors, we have the option to store the data in 16-bit floats
     // in order to save memory and also to speed up the computation
-    const ggml_v1_type wtype = model.hparams.f16 ? GGML_V1_TYPE_F16 : GGML_V1_TYPE_F32;
+    const ggml_v1_type wtype = GGML_V1_TYPE_F16;
 
     auto & ctx = model.ctx;
 
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index db230866f..9b8c0baa5 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -2,6 +2,7 @@
 #include "otherarch.h"
 
 #include "utils.h"
+#include "common-ggml.h"
 
 #include
 #include
@@ -47,14 +48,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: ftype = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -85,24 +86,13 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    ggml_type wtype = GGML_TYPE_COUNT;
-    switch (model.hparams.f16) {
-        case 0: wtype = GGML_TYPE_F32; break;
-        case 1: wtype = GGML_TYPE_F16; break;
-        case 2: wtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = GGML_TYPE_Q4_1; break;
-        case 5: wtype = GGML_TYPE_Q4_2; break;
-        case 6: wtype = GGML_TYPE_Q4_3; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
-                    return ModelLoadResult::FAIL;
-                }
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return ModelLoadResult::FAIL;
     }
-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
diff --git a/otherarch/gptj_v1.cpp b/otherarch/gptj_v1.cpp
index 6d3530c69..2f6ae9898 100644
--- a/otherarch/gptj_v1.cpp
+++ b/otherarch/gptj_v1.cpp
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -56,7 +56,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: f16 = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -86,7 +86,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
     ggml_v1_type wtype = GGML_V1_TYPE_COUNT;
-    switch (model.hparams.f16) {
+    switch (model.hparams.ftype) {
         case 0: wtype = GGML_V1_TYPE_F32; break;
         case 1: wtype = GGML_V1_TYPE_F16; break;
         case 2: wtype = GGML_V1_TYPE_Q4_0; break;
@@ -94,7 +94,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
         default:
                 {
                     fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
+                            __func__, fname.c_str(), model.hparams.ftype);
                     return ModelLoadResult::FAIL;
                 }
     }
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 0f8bd8815..11c53d141 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -2,6 +2,7 @@
 #include "otherarch.h"
 
 #include "utils.h"
+#include "common-ggml.h"
 
 #include
 #include
@@ -48,7 +49,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -56,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: ftype = %d\n", __func__, hparams.ftype);
     }
 
     // load vocab
@@ -85,24 +86,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    ggml_type wtype = GGML_TYPE_COUNT;
-    switch (model.hparams.f16) {
-        case 0: wtype = GGML_TYPE_F32; break;
-        case 1: wtype = GGML_TYPE_F16; break;
-        case 2: wtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = GGML_TYPE_Q4_1; break;
-        case 5: wtype = GGML_TYPE_Q4_2; break;
-        case 6: wtype = GGML_TYPE_Q4_3; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                            __func__, fname.c_str(), model.hparams.f16);
-                    return ModelLoadResult::FAIL;
-                }
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return ModelLoadResult::FAIL;
     }
-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;
 
     auto memory_type = GGML_TYPE_F16;
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 519bc34a7..32e5b1463 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -2,6 +2,7 @@
 #include "otherarch.h"
 
 #include "utils.h"
+#include "common-ggml.h"
 
 #include
 #include
@@ -76,24 +77,13 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    ggml_type wtype = GGML_TYPE_COUNT;
-    switch (model.hparams.ftype) {
-        case 0: wtype = GGML_TYPE_F32; break;
-        case 1: wtype = GGML_TYPE_F16; break;
-        case 2: wtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = GGML_TYPE_Q4_1; break;
-        case 5: wtype = GGML_TYPE_Q4_2; break;
-        case 6: wtype = GGML_TYPE_Q4_3; break;
-        default:
-                {
-                    fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                            __func__, fname.c_str(), model.hparams.ftype);
-                    return ModelLoadResult::FAIL;
-                }
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return ModelLoadResult::FAIL;
     }
-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
index b8a1951e5..3713be7ce 100644
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@@ -23,7 +23,7 @@ struct gptj_hparams {
     int32_t n_head = 16;
     int32_t n_layer = 28;
     int32_t n_rot = 64;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 struct gptj_layer {
@@ -120,7 +120,7 @@ struct gpt2_hparams {
     int32_t n_embd = 768;
     int32_t n_head = 12;
     int32_t n_layer = 12;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 struct gpt2_v1_layer {
diff --git a/otherarch/tools/common-ggml.cpp b/otherarch/tools/common-ggml.cpp
index 71406ced7..b46883610 100644
--- a/otherarch/tools/common-ggml.cpp
+++ b/otherarch/tools/common-ggml.cpp
@@ -1,26 +1,86 @@
-#include "otherarch/tools/common-ggml.h"
-
-#include "ggml.h"
+#include "common-ggml.h"
 
 #include
 
+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
+    {"q4_3", GGML_FTYPE_MOSTLY_Q4_3},
+    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+void ggml_print_ftypes(FILE * fp) {
+    for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+        fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+    }
+}
+
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+    enum ggml_ftype ftype;
+    if (str[0] == 'q') {
+        const auto it = GGML_FTYPE_MAP.find(str);
+        if (it == GGML_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+            return GGML_FTYPE_UNKNOWN;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum ggml_ftype) atoi(str);
+    }
+
+    return ftype;
+}
+
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype) {
+    ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
+        case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+        case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
+        case GGML_FTYPE_MOSTLY_Q4_3: wtype = GGML_TYPE_Q4_3; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+    }
+
+    return wtype;
+}
+
 bool ggml_common_quantize_0(
         std::ifstream & finp,
         std::ofstream & fout,
-        const ggml_mtype mtype,
+        const ggml_ftype ftype,
         const std::vector<std::string> & to_quant,
         const std::vector<std::string> & to_skip) {
 
     ggml_type qtype = GGML_TYPE_F32;
 
-    switch (mtype) {
-        case 2: qtype = GGML_TYPE_Q4_0; break;
-        case 3: qtype = GGML_TYPE_Q4_1; break;
-        case 5: qtype = GGML_TYPE_Q4_2; break;
-        case 6: qtype = GGML_TYPE_Q4_3; break;
-        default:
+    switch (ftype) {
+        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
+        case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN:
+        case GGML_FTYPE_ALL_F32:
+        case GGML_FTYPE_MOSTLY_F16:
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
                 {
-                    fprintf(stderr, "%s: invalid model type %d\n", __func__, mtype);
+                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                     return false;
                 }
     };
@@ -127,7 +187,7 @@ bool ggml_common_quantize_0(
                 size_t cur_size = 0;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                switch (ttype) {
+                switch ((ggml_type) ttype) {
                     case GGML_TYPE_Q4_0:
                         {
                             cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
@@ -144,7 +204,25 @@ bool ggml_common_quantize_0(
                         {
                             cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
-                    default:
+                    case GGML_TYPE_Q5_0:
+                        {
+                            cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q5_1:
+                        {
+                            cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q8_0:
+                        {
+                            cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_I8:
+                    case GGML_TYPE_I16:
+                    case GGML_TYPE_I32:
+                    case GGML_TYPE_Q8_1:
+                    case GGML_TYPE_COUNT:
                         {
                             fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                             return false;
@@ -173,7 +251,7 @@ bool ggml_common_quantize_0(
     }
 
     printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    printf("%s: quant size = %8.2f MB | mtype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, mtype, ggml_type_name(qtype));
+    printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
 
     {
         int64_t sum_all = 0;
diff --git a/otherarch/tools/common-ggml.h b/otherarch/tools/common-ggml.h
index 6299cfdb5..af57ea5d1 100644
--- a/otherarch/tools/common-ggml.h
+++ b/otherarch/tools/common-ggml.h
@@ -1,23 +1,37 @@
 #pragma once
 
+#include "ggml.h"
+
+#include
 #include
 #include
 #include
 
 // model file types
-enum ggml_mtype {
-    GGML_MTYPE_ALL_F32 = 0,
-    GGML_MTYPE_MOSTLY_F16 = 1, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-    GGML_MTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-    GGML_MTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+enum ggml_ftype {
+    GGML_FTYPE_UNKNOWN = -1,
+    GGML_FTYPE_ALL_F32 = 0,
+    GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+    GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 };
 
+void ggml_print_ftypes(FILE * fp = stderr);
+
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+// TODO: temporary
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype);
+
 bool ggml_common_quantize_0(
         std::ifstream & finp,
         std::ofstream & fout,
-        const ggml_mtype mtype,
+        const ggml_ftype ftype,
         const std::vector<std::string> & to_quant,
         const std::vector<std::string> & to_skip);
\ No newline at end of file
diff --git a/otherarch/tools/gpt2_quantize.cpp b/otherarch/tools/gpt2_quantize.cpp
index 196db0cfd..7e9ad7739 100644
--- a/otherarch/tools/gpt2_quantize.cpp
+++ b/otherarch/tools/gpt2_quantize.cpp
@@ -1,6 +1,4 @@
-#include "ggml.h"
-
-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"
 
 #include
@@ -20,11 +18,11 @@ struct gpt2_hparams {
     int32_t n_embd = 768;
     int32_t n_head = 12;
     int32_t n_layer = 12;
-    int32_t f16 = 1;
+    int32_t ftype = 1;
 };
 
 // quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
     gpt_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -62,21 +60,21 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
         finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
+        printf("%s: f16 = %d\n", __func__, hparams.ftype);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
         fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
         fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &mtype, sizeof(hparams.f16));
+        fout.write((char *) &ftype, sizeof(hparams.ftype));
     }
 
     // load vocab
@@ -116,7 +114,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         "model/h.*/mlp/c_proj/w",
     };
 
-    if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
         fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
         return false;
     }
@@ -134,10 +132,7 @@ int main(int argc, char ** argv) {
     ggml_time_init();
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        fprintf(stderr, "  type = 5 - q4_2\n");
-        fprintf(stderr, "  type = 6 - q4_3\n");
+        ggml_print_ftypes(stderr);
         return 1;
     }
 
@@ -151,7 +146,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int mtype = atoi(argv[3]);
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -161,7 +156,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!gpt2_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+        if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/otherarch/tools/gptj_quantize.cpp b/otherarch/tools/gptj_quantize.cpp
index f3ca21d71..a9f4232be 100644
--- a/otherarch/tools/gptj_quantize.cpp
+++ b/otherarch/tools/gptj_quantize.cpp
@@ -1,6 +1,6 @@
 #include "ggml.h"
 
-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"
 
 #include
@@ -25,7 +25,7 @@ struct gptj_hparams {
 };
 
 // quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
     gpt_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -79,7 +79,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fout.write((char *) &mtype, sizeof(hparams.f16));
+        fout.write((char *) &ftype, sizeof(hparams.f16));
     }
 
     // load vocab
@@ -114,7 +114,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         ".*weight",
     };
 
-    if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
         fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
         return false;
     }
@@ -132,10 +132,7 @@ int main(int argc, char ** argv) {
     ggml_time_init();
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        fprintf(stderr, "  type = 5 - q4_2\n");
-        fprintf(stderr, "  type = 6 - q4_3\n");
+        ggml_print_ftypes(stderr);
        return 1;
     }
 
@@ -149,7 +146,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int mtype = atoi(argv[3]);
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -159,7 +156,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!gptj_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+        if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/otherarch/tools/neox_quantize.cpp b/otherarch/tools/neox_quantize.cpp
index d32282eac..b719a5dc5 100644
--- a/otherarch/tools/neox_quantize.cpp
+++ b/otherarch/tools/neox_quantize.cpp
@@ -1,6 +1,6 @@
 #include "ggml.h"
 
-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"
 
 #include
@@ -25,7 +25,7 @@ struct stablelm_hparams {
 };
 
 // quantize a model
-bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
     gpt_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -79,7 +79,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
         fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-        fout.write((char *) &mtype, sizeof(hparams.ftype));
+        fout.write((char *) &ftype, sizeof(hparams.ftype));
     }
 
     // load vocab
@@ -106,7 +106,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
         ".*weight",
     };
 
-    if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
         fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
         return false;
     }
@@ -121,12 +121,10 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 //  ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
+    ggml_time_init();
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
-        fprintf(stderr, "  type = 5 - q4_2\n");
-        fprintf(stderr, "  type = 6 - q4_3\n");
+        ggml_print_ftypes(stderr);
         return 1;
     }
 
@@ -140,7 +138,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int mtype = atoi(argv[3]);
+    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -150,7 +148,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!stablelm_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+        if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
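
Usage note (not part of the patch): the quantization tools touched above are built by the new "tools" Makefile target, and each one now accepts either a format name or its numeric ftype as the third argument. Below is a minimal sketch of how the helpers added in otherarch/tools/common-ggml.h fit together; the file name example_ftype.cpp and the "q5_0" argument are illustrative only, and the snippet assumes common-ggml.h and ggml.h are reachable via the revised CFLAGS/CXXFLAGS include paths.

    // example_ftype.cpp (hypothetical): map a user-supplied type string to a ggml_type,
    // mirroring what the *_quantize.cpp main() functions do after this patch.
    #include "common-ggml.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        const char * arg = (argc > 1) ? argv[1] : "q5_0";

        // accepts a name such as "q5_0", or a raw integer such as "8"
        const enum ggml_ftype ftype = ggml_parse_ftype(arg);
        if (ftype == GGML_FTYPE_UNKNOWN) {
            ggml_print_ftypes(stderr); // prints the supported type names and values
            return 1;
        }

        // map the file-level ftype to the per-tensor quantization type
        const enum ggml_type wtype = ggml_ftype_to_ggml_type(ftype);
        if (wtype == GGML_TYPE_COUNT) {
            return 1; // e.g. GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 has no single tensor type
        }

        printf("ftype %d -> %s\n", ftype, ggml_type_name(wtype));
        return 0;
    }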