integrated q5 formats
parent e8a389f85b
commit 032a171867
13 changed files with 184 additions and 129 deletions
Makefile (5 changes)

@@ -1,5 +1,6 @@
default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
simple: koboldcpp koboldcpp_noavx2
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
dev: koboldcpp_openblas

@@ -45,8 +46,8 @@ endif
 #
 # keep standard at C11 and C++11
-CFLAGS = -I. -I./include -I./include/CL -Ofast -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -Ofast -DNDEBUG -std=c++11 -fPIC
+CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =

 # these are used on windows, to build some libraries with extra old device compatibility
@@ -9,19 +9,21 @@
 #include <time.h>
 #include "model_adapter.h"
-#include "otherarch/otherarch.h"
+#include "otherarch.h"

 //for easier compilation
 #include "llamaextra.cpp"

 //concat source files into one file for compilation purposes
-#include "otherarch/utils.cpp"
-#include "otherarch/gptj_v1.cpp"
-#include "otherarch/gptj_v2.cpp"
-#include "otherarch/gpt2_v1.cpp"
-#include "otherarch/gpt2_v2.cpp"
-#include "otherarch/rwkv.cpp"
-#include "otherarch/neox.cpp"
+#include "common-ggml.cpp"
+#include "utils.cpp"
+#include "gptj_v1.cpp"
+#include "gptj_v2.cpp"
+#include "gpt2_v1.cpp"
+#include "gpt2_v2.cpp"
+#include "rwkv.cpp"
+#include "neox.cpp"

 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 //used to expand KV size if needed
 desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);

@@ -58,7 +58,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: f16 = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -87,7 +87,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 // for the big tensors, we have the option to store the data in 16-bit floats
 // in order to save memory and also to speed up the computation
-const ggml_v1_type wtype = model.hparams.f16 ? GGML_V1_TYPE_F16 : GGML_V1_TYPE_F32;
+const ggml_v1_type wtype = GGML_V1_TYPE_F16;

 auto & ctx = model.ctx;
@@ -2,6 +2,7 @@
 #include "otherarch.h"

 #include "utils.h"
+#include "common-ggml.h"

 #include <cassert>
 #include <cmath>

@@ -47,14 +48,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: ftype = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -85,24 +86,13 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
-ggml_type wtype = GGML_TYPE_COUNT;
-switch (model.hparams.f16) {
-case 0: wtype = GGML_TYPE_F32; break;
-case 1: wtype = GGML_TYPE_F16; break;
-case 2: wtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = GGML_TYPE_Q4_1; break;
-case 5: wtype = GGML_TYPE_Q4_2; break;
-case 6: wtype = GGML_TYPE_Q4_3; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
-return ModelLoadResult::FAIL;
-}
+ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+__func__, fname.c_str(), model.hparams.ftype);
+return ModelLoadResult::FAIL;
+}

 const ggml_type wtype2 = GGML_TYPE_F32;

 auto & ctx = model.ctx;

 size_t ctx_size = 0;
@@ -48,7 +48,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -56,7 +56,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: f16 = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -86,7 +86,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
 ggml_v1_type wtype = GGML_V1_TYPE_COUNT;
-switch (model.hparams.f16) {
+switch (model.hparams.ftype) {
 case 0: wtype = GGML_V1_TYPE_F32; break;
 case 1: wtype = GGML_V1_TYPE_F16; break;
 case 2: wtype = GGML_V1_TYPE_Q4_0; break;

@@ -94,7 +94,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
 default:
 {
 fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
+__func__, fname.c_str(), model.hparams.ftype);
 return ModelLoadResult::FAIL;
 }
 }
@@ -2,6 +2,7 @@
 #include "otherarch.h"

 #include "utils.h"
+#include "common-ggml.h"

 #include <cassert>
 #include <cmath>

@@ -48,7 +49,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fin.read((char *) &hparams.f16, sizeof(hparams.f16));
+fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);

@@ -56,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: ftype = %d\n", __func__, hparams.ftype);
 }

 // load vocab

@@ -85,24 +86,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
-ggml_type wtype = GGML_TYPE_COUNT;
-switch (model.hparams.f16) {
-case 0: wtype = GGML_TYPE_F32; break;
-case 1: wtype = GGML_TYPE_F16; break;
-case 2: wtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = GGML_TYPE_Q4_1; break;
-case 5: wtype = GGML_TYPE_Q4_2; break;
-case 6: wtype = GGML_TYPE_Q4_3; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-__func__, fname.c_str(), model.hparams.f16);
-return ModelLoadResult::FAIL;
-}
+ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+__func__, fname.c_str(), model.hparams.ftype);
+return ModelLoadResult::FAIL;
+}

 const ggml_type wtype2 = GGML_TYPE_F32;

 auto & ctx = model.ctx;

 auto memory_type = GGML_TYPE_F16;
@@ -2,6 +2,7 @@
 #include "otherarch.h"

 #include "utils.h"
+#include "common-ggml.h"

 #include <cassert>
 #include <cmath>

@@ -76,24 +77,13 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
-ggml_type wtype = GGML_TYPE_COUNT;
-switch (model.hparams.ftype) {
-case 0: wtype = GGML_TYPE_F32; break;
-case 1: wtype = GGML_TYPE_F16; break;
-case 2: wtype = GGML_TYPE_Q4_0; break;
-case 3: wtype = GGML_TYPE_Q4_1; break;
-case 5: wtype = GGML_TYPE_Q4_2; break;
-case 6: wtype = GGML_TYPE_Q4_3; break;
-default:
-{
-fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-__func__, fname.c_str(), model.hparams.ftype);
-return ModelLoadResult::FAIL;
-}
+ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+__func__, fname.c_str(), model.hparams.ftype);
+return ModelLoadResult::FAIL;
+}

 const ggml_type wtype2 = GGML_TYPE_F32;

 auto & ctx = model.ctx;

 size_t ctx_size = 0;
@@ -23,7 +23,7 @@ struct gptj_hparams {
 int32_t n_head = 16;
 int32_t n_layer = 28;
 int32_t n_rot = 64;
-int32_t f16 = 1;
+int32_t ftype = 1;
 };

 struct gptj_layer {

@@ -120,7 +120,7 @@ struct gpt2_hparams {
 int32_t n_embd = 768;
 int32_t n_head = 12;
 int32_t n_layer = 12;
-int32_t f16 = 1;
+int32_t ftype = 1;
 };

 struct gpt2_v1_layer {
@@ -1,26 +1,86 @@
-#include "otherarch/tools/common-ggml.h"
+#include "ggml.h"
+#include "common-ggml.h"

 #include <regex>

+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
+{"q4_3", GGML_FTYPE_MOSTLY_Q4_3},
+{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+void ggml_print_ftypes(FILE * fp) {
+for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
+fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
+}
+}
+
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+enum ggml_ftype ftype;
+if (str[0] == 'q') {
+const auto it = GGML_FTYPE_MAP.find(str);
+if (it == GGML_FTYPE_MAP.end()) {
+fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+return GGML_FTYPE_UNKNOWN;
+}
+ftype = it->second;
+} else {
+ftype = (enum ggml_ftype) atoi(str);
+}
+
+return ftype;
+}
+
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype) {
+ggml_type wtype = GGML_TYPE_COUNT;
+
+switch (ftype) {
+case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
+case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
+case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
+case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
+case GGML_FTYPE_MOSTLY_Q4_3: wtype = GGML_TYPE_Q4_3; break;
+case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
+case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
+case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
+case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
+case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+}
+
+if (wtype == GGML_TYPE_COUNT) {
+fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+}
+
+return wtype;
+}
+
 bool ggml_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_mtype mtype,
+const ggml_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip) {

 ggml_type qtype = GGML_TYPE_F32;

-switch (mtype) {
-case 2: qtype = GGML_TYPE_Q4_0; break;
-case 3: qtype = GGML_TYPE_Q4_1; break;
-case 5: qtype = GGML_TYPE_Q4_2; break;
-case 6: qtype = GGML_TYPE_Q4_3; break;
-default:
+switch (ftype) {
+case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
+case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break;
+case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+case GGML_FTYPE_UNKNOWN:
+case GGML_FTYPE_ALL_F32:
+case GGML_FTYPE_MOSTLY_F16:
+case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
 {
-fprintf(stderr, "%s: invalid model type %d\n", __func__, mtype);
+fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
 return false;
 }
 };

@@ -127,7 +187,7 @@ bool ggml_common_quantize_0(
 size_t cur_size = 0;
 std::vector<int64_t> hist_cur(1 << 4, 0);

-switch (ttype) {
+switch ((ggml_type) ttype) {
 case GGML_TYPE_Q4_0:
 {
 cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());

@@ -144,7 +204,25 @@ bool ggml_common_quantize_0(
 {
 cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
 } break;
-default:
+case GGML_TYPE_Q5_0:
+{
+cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+} break;
+case GGML_TYPE_Q5_1:
+{
+cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+} break;
+case GGML_TYPE_Q8_0:
+{
+cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+} break;
+case GGML_TYPE_F32:
+case GGML_TYPE_F16:
+case GGML_TYPE_I8:
+case GGML_TYPE_I16:
+case GGML_TYPE_I32:
+case GGML_TYPE_Q8_1:
+case GGML_TYPE_COUNT:
 {
 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
 return false;

@@ -173,7 +251,7 @@ bool ggml_common_quantize_0(
 }

 printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-printf("%s: quant size = %8.2f MB | mtype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, mtype, ggml_type_name(qtype));
+printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

 {
 int64_t sum_all = 0;
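Every model loader in this commit converges on the same pattern: the per-model switch over hparams.f16 is replaced by one call to the shared helper above. A minimal sketch of that pattern, assuming only the common-ggml.h declarations from this commit (pick_weight_type is an illustrative name, not part of the change):

// Sketch only: how a loader maps the ftype stored in the model file to a
// ggml tensor type via the shared helper instead of a local switch.
#include <cstdio>
#include "common-ggml.h"

static ggml_type pick_weight_type(int stored_ftype) {
    // ggml_ftype_to_ggml_type returns GGML_TYPE_COUNT for anything it cannot
    // map; the loaders treat that as a bad model file and abort the load.
    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) stored_ftype);
    if (wtype == GGML_TYPE_COUNT) {
        fprintf(stderr, "invalid ftype value %d\n", stored_ftype);
    }
    return wtype;
}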
@@ -1,23 +1,37 @@
 #pragma once

 #include "ggml.h"

 #include <map>
 #include <fstream>
 #include <vector>
 #include <string>

 // model file types
-enum ggml_mtype {
-GGML_MTYPE_ALL_F32 = 0,
-GGML_MTYPE_MOSTLY_F16 = 1, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-GGML_MTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-GGML_MTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+enum ggml_ftype {
+GGML_FTYPE_UNKNOWN = -1,
+GGML_FTYPE_ALL_F32 = 0,
+GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 };

+void ggml_print_ftypes(FILE * fp = stderr);
+
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+// TODO: temporary
+enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype);
+
 bool ggml_common_quantize_0(
 std::ifstream & finp,
 std::ofstream & fout,
-const ggml_mtype mtype,
+const ggml_ftype ftype,
 const std::vector<std::string> & to_quant,
 const std::vector<std::string> & to_skip);
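The three quantize tools that follow all drop their hard-coded "type = 2..6" help text and atoi() argument parsing in favour of the helpers declared above. A minimal sketch of the shared argument handling, assuming the common-ggml.h API from this commit (the surrounding main() is illustrative and trimmed to the argument-parsing step):

// Sketch only: quantize-tool argument handling after this change. Accepts a
// named format such as "q5_0" or a raw numeric ftype value.
#include <cstdio>
#include "common-ggml.h"

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr); // lists q4_0 .. q8_0, including the new q5_0/q5_1
        return 1;
    }
    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
    if (ftype == GGML_FTYPE_UNKNOWN) {
        return 1; // ggml_parse_ftype already reported the unknown name
    }
    // ...open argv[1] and quantize into argv[2] via the per-model *_model_quantize(..., ftype)
    return 0;
}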
@@ -1,6 +1,4 @@
 #include "ggml.h"

-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -20,11 +18,11 @@ struct gpt2_hparams {
 int32_t n_embd = 768;
 int32_t n_head = 12;
 int32_t n_layer = 12;
-int32_t f16 = 1;
+int32_t ftype = 1;
 };

 // quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -62,21 +60,21 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
 printf("%s: n_head = %d\n", __func__, hparams.n_head);
 printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-printf("%s: f16 = %d\n", __func__, hparams.f16);
+printf("%s: f16 = %d\n", __func__, hparams.ftype);

 fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
 fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
 fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-fout.write((char *) &mtype, sizeof(hparams.f16));
+fout.write((char *) &ftype, sizeof(hparams.ftype));
 }

 // load vocab

@@ -116,7 +114,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
 "model/h.*/mlp/c_proj/w",
 };

-if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -134,10 +132,7 @@ int main(int argc, char ** argv) {
 ggml_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-fprintf(stderr, " type = 2 - q4_0\n");
-fprintf(stderr, " type = 3 - q4_1\n");
-fprintf(stderr, " type = 5 - q4_2\n");
-fprintf(stderr, " type = 6 - q4_3\n");
+ggml_print_ftypes(stderr);
 return 1;
 }

@@ -151,7 +146,7 @@ int main(int argc, char ** argv) {
 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const int mtype = atoi(argv[3]);
+const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

 const int64_t t_main_start_us = ggml_time_us();

@@ -161,7 +156,7 @@ int main(int argc, char ** argv) {
 {
 const int64_t t_start_us = ggml_time_us();

-if (!gpt2_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }
@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -25,7 +25,7 @@ struct gptj_hparams {
 };

 // quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -79,7 +79,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
 fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fout.write((char *) &mtype, sizeof(hparams.f16));
+fout.write((char *) &ftype, sizeof(hparams.f16));
 }

 // load vocab

@@ -114,7 +114,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -132,10 +132,7 @@ int main(int argc, char ** argv) {
 ggml_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-fprintf(stderr, " type = 2 - q4_0\n");
-fprintf(stderr, " type = 3 - q4_1\n");
-fprintf(stderr, " type = 5 - q4_2\n");
-fprintf(stderr, " type = 6 - q4_3\n");
+ggml_print_ftypes(stderr);
 return 1;
 }

@@ -149,7 +146,7 @@ int main(int argc, char ** argv) {
 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const int mtype = atoi(argv[3]);
+const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

 const int64_t t_main_start_us = ggml_time_us();

@@ -159,7 +156,7 @@ int main(int argc, char ** argv) {
 {
 const int64_t t_start_us = ggml_time_us();

-if (!gptj_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }
@@ -1,6 +1,6 @@
 #include "ggml.h"

-#include "otherarch/utils.h"
+#include "utils.h"
 #include "common-ggml.h"

 #include <cassert>

@@ -25,7 +25,7 @@ struct stablelm_hparams {
 };

 // quantize a model
-bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
+bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
 gpt_vocab vocab;

 printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -79,7 +79,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
 fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
-fout.write((char *) &mtype, sizeof(hparams.ftype));
+fout.write((char *) &ftype, sizeof(hparams.ftype));
 }

 // load vocab

@@ -106,7 +106,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 ".*weight",
 };

-if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
 fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
 return false;
 }

@@ -121,12 +121,10 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
 // ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
 ggml_time_init();
 if (argc != 4) {
 fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-fprintf(stderr, " type = 2 - q4_0\n");
-fprintf(stderr, " type = 3 - q4_1\n");
-fprintf(stderr, " type = 5 - q4_2\n");
-fprintf(stderr, " type = 6 - q4_3\n");
+ggml_print_ftypes(stderr);
 return 1;
 }

@@ -140,7 +138,7 @@ int main(int argc, char ** argv) {
 const std::string fname_inp = argv[1];
 const std::string fname_out = argv[2];

-const int mtype = atoi(argv[3]);
+const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

 const int64_t t_main_start_us = ggml_time_us();

@@ -150,7 +148,7 @@ int main(int argc, char ** argv) {
 {
 const int64_t t_start_us = ggml_time_us();

-if (!stablelm_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
+if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
 fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
 return 1;
 }