integrated q5 formats

Concedo 2023-04-28 12:58:39 +08:00
parent e8a389f85b
commit 032a171867
13 changed files with 184 additions and 129 deletions

View file

@@ -1,5 +1,6 @@
default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
simple: koboldcpp koboldcpp_noavx2
tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
dev: koboldcpp_openblas
@@ -45,8 +46,8 @@ endif
#
# keep standard at C11 and C++11
CFLAGS = -I. -I./include -I./include/CL -Ofast -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -I./include -I./include/CL -Ofast -DNDEBUG -std=c++11 -fPIC
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# these are used on windows, to build some libraries with extra old device compatibility

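For context, the added -I./otherarch and -I./otherarch/tools search paths are what let the sources in the following files refer to the relocated headers and implementation files by bare name. A minimal illustration, assuming the layout implied by this diff:

    // before: the path had to be spelled out relative to the repository root
    #include "otherarch/tools/common-ggml.h"
    // after: found through the new -I./otherarch/tools search path
    #include "common-ggml.h"
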
View file

@@ -9,19 +9,21 @@
#include <time.h>
#include "model_adapter.h"
#include "otherarch/otherarch.h"
#include "otherarch.h"
//for easier compilation
#include "llamaextra.cpp"
//concat source files into one file for compilation purposes
#include "otherarch/utils.cpp"
#include "otherarch/gptj_v1.cpp"
#include "otherarch/gptj_v2.cpp"
#include "otherarch/gpt2_v1.cpp"
#include "otherarch/gpt2_v2.cpp"
#include "otherarch/rwkv.cpp"
#include "otherarch/neox.cpp"
#include "common-ggml.cpp"
#include "utils.cpp"
#include "gptj_v1.cpp"
#include "gptj_v2.cpp"
#include "gpt2_v1.cpp"
#include "gpt2_v2.cpp"
#include "rwkv.cpp"
#include "neox.cpp"
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
static FileFormat file_format = FileFormat::BADFORMAT;

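The adapter is built as a single translation unit: instead of listing every backend source in the Makefile, it textually #includes the .cpp files ("concat source files into one file for compilation purposes"), so the only build change required is the extra include paths above. A self-contained sketch of the same pattern, with hypothetical file names:

    // part_one.cpp (hypothetical)
    int add(int a, int b) { return a + b; }

    // unity.cpp (hypothetical) -- mirrors what gpttype_adapter.cpp does above
    #include <cstdio>
    #include "part_one.cpp"   // textual inclusion: add() becomes part of this translation unit

    int main() {
        printf("%d\n", add(2, 3));   // prints 5; only unity.cpp needs to be compiled
        return 0;
    }
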
View file

@@ -48,7 +48,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
//used to expand KV size if needed
desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
@@ -58,7 +58,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: f16 = %d\n", __func__, hparams.ftype);
}
// load vocab
@@ -87,7 +87,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
// for the big tensors, we have the option to store the data in 16-bit floats
// in order to save memory and also to speed up the computation
const ggml_v1_type wtype = model.hparams.f16 ? GGML_V1_TYPE_F16 : GGML_V1_TYPE_F32;
const ggml_v1_type wtype = GGML_V1_TYPE_F16;
auto & ctx = model.ctx;

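The legacy loader reads the hyperparameter block one int32 field at a time; only the last field changes here, as the former f16 flag is now read into hparams.ftype. A minimal sketch of that header read, with the field order inferred from the reads and prints in this hunk (n_vocab, n_ctx, n_embd, n_head, n_layer, ftype):

    #include <cstdint>
    #include <fstream>

    // sketch only: the hyperparameter block that follows the file magic
    struct legacy_gpt2_header {
        int32_t n_vocab, n_ctx, n_embd, n_head, n_layer, ftype;
    };

    bool read_header(std::ifstream & fin, legacy_gpt2_header & h) {
        fin.read((char *) &h.n_vocab, sizeof(h.n_vocab));
        fin.read((char *) &h.n_ctx,   sizeof(h.n_ctx));
        fin.read((char *) &h.n_embd,  sizeof(h.n_embd));
        fin.read((char *) &h.n_head,  sizeof(h.n_head));
        fin.read((char *) &h.n_layer, sizeof(h.n_layer));
        fin.read((char *) &h.ftype,   sizeof(h.ftype));
        return fin.good();
    }
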
View file

@@ -2,6 +2,7 @@
#include "otherarch.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
@@ -47,14 +48,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
}
// load vocab
@@ -85,24 +86,13 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = GGML_TYPE_COUNT;
switch (model.hparams.f16) {
case 0: wtype = GGML_TYPE_F32; break;
case 1: wtype = GGML_TYPE_F16; break;
case 2: wtype = GGML_TYPE_Q4_0; break;
case 3: wtype = GGML_TYPE_Q4_1; break;
case 5: wtype = GGML_TYPE_Q4_2; break;
case 6: wtype = GGML_TYPE_Q4_3; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
return ModelLoadResult::FAIL;
}
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return ModelLoadResult::FAIL;
}
const ggml_type wtype2 = GGML_TYPE_F32;
auto & ctx = model.ctx;
size_t ctx_size = 0;

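Replacing the hand-written switch with ggml_ftype_to_ggml_type (defined later in this commit) is what lets the loader accept the new q5/q8 files without per-model changes: any ftype the helper does not recognise comes back as GGML_TYPE_COUNT and the load fails cleanly. A small sketch of the mapping, assuming common-ggml.h is on the include path:

    #include "common-ggml.h"

    // sketch of the mapping the loaders now rely on (values from common-ggml.h below)
    void example() {
        ggml_type a = ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q5_0); // 8 -> GGML_TYPE_Q5_0
        ggml_type b = ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q5_1); // 9 -> GGML_TYPE_Q5_1
        ggml_type c = ggml_ftype_to_ggml_type(GGML_FTYPE_UNKNOWN);     // -> GGML_TYPE_COUNT, caller rejects the file
        (void) a; (void) b; (void) c;
    }
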
View file

@@ -48,7 +48,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -56,7 +56,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: f16 = %d\n", __func__, hparams.ftype);
}
// load vocab
@@ -86,7 +86,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_v1_type wtype = GGML_V1_TYPE_COUNT;
switch (model.hparams.f16) {
switch (model.hparams.ftype) {
case 0: wtype = GGML_V1_TYPE_F32; break;
case 1: wtype = GGML_V1_TYPE_F16; break;
case 2: wtype = GGML_V1_TYPE_Q4_0; break;
@@ -94,7 +94,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
__func__, fname.c_str(), model.hparams.ftype);
return ModelLoadResult::FAIL;
}
}

View file

@@ -2,6 +2,7 @@
#include "otherarch.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
@@ -48,7 +49,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -56,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
}
// load vocab
@@ -85,24 +86,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = GGML_TYPE_COUNT;
switch (model.hparams.f16) {
case 0: wtype = GGML_TYPE_F32; break;
case 1: wtype = GGML_TYPE_F16; break;
case 2: wtype = GGML_TYPE_Q4_0; break;
case 3: wtype = GGML_TYPE_Q4_1; break;
case 5: wtype = GGML_TYPE_Q4_2; break;
case 6: wtype = GGML_TYPE_Q4_3; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
return ModelLoadResult::FAIL;
}
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return ModelLoadResult::FAIL;
}
const ggml_type wtype2 = GGML_TYPE_F32;
auto & ctx = model.ctx;
auto memory_type = GGML_TYPE_F16;

View file

@@ -2,6 +2,7 @@
#include "otherarch.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
@@ -76,24 +77,13 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = GGML_TYPE_COUNT;
switch (model.hparams.ftype) {
case 0: wtype = GGML_TYPE_F32; break;
case 1: wtype = GGML_TYPE_F16; break;
case 2: wtype = GGML_TYPE_Q4_0; break;
case 3: wtype = GGML_TYPE_Q4_1; break;
case 5: wtype = GGML_TYPE_Q4_2; break;
case 6: wtype = GGML_TYPE_Q4_3; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return ModelLoadResult::FAIL;
}
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return ModelLoadResult::FAIL;
}
const ggml_type wtype2 = GGML_TYPE_F32;
auto & ctx = model.ctx;
size_t ctx_size = 0;

View file

@@ -23,7 +23,7 @@ struct gptj_hparams {
int32_t n_head = 16;
int32_t n_layer = 28;
int32_t n_rot = 64;
int32_t f16 = 1;
int32_t ftype = 1;
};
struct gptj_layer {
@@ -120,7 +120,7 @@ struct gpt2_hparams {
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
int32_t f16 = 1;
int32_t ftype = 1;
};
struct gpt2_v1_layer {

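In the shared hparams structs the f16 field becomes ftype. The rename is source-only: both are a single int32_t in the same position with the same default, so headers written by existing converters load unchanged; the value is simply interpreted as a ggml_ftype index instead of an fp16 flag. A simplified sketch (only the fields shown in the hunks above):

    struct hparams_before { int32_t n_head = 12; int32_t n_layer = 12; int32_t f16   = 1; };
    struct hparams_after  { int32_t n_head = 12; int32_t n_layer = 12; int32_t ftype = 1; };
    static_assert(sizeof(hparams_before) == sizeof(hparams_after),
                  "renaming f16 to ftype keeps the same int32 field, so nothing on disk changes");
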
View file

@@ -1,26 +1,86 @@
#include "otherarch/tools/common-ggml.h"
#include "ggml.h"
#include "common-ggml.h"
#include <regex>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q4_3", GGML_FTYPE_MOSTLY_Q4_3},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
};
void ggml_print_ftypes(FILE * fp) {
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
}
enum ggml_ftype ggml_parse_ftype(const char * str) {
enum ggml_ftype ftype;
if (str[0] == 'q') {
const auto it = GGML_FTYPE_MAP.find(str);
if (it == GGML_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
return GGML_FTYPE_UNKNOWN;
}
ftype = it->second;
} else {
ftype = (enum ggml_ftype) atoi(str);
}
return ftype;
}
enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype) {
ggml_type wtype = GGML_TYPE_COUNT;
switch (ftype) {
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q4_3: wtype = GGML_TYPE_Q4_3; break;
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
}
return wtype;
}
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_mtype mtype,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip) {
ggml_type qtype = GGML_TYPE_F32;
switch (mtype) {
case 2: qtype = GGML_TYPE_Q4_0; break;
case 3: qtype = GGML_TYPE_Q4_1; break;
case 5: qtype = GGML_TYPE_Q4_2; break;
case 6: qtype = GGML_TYPE_Q4_3; break;
default:
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN:
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, mtype);
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
}
};
@@ -127,7 +187,7 @@ bool ggml_common_quantize_0(
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch (ttype) {
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
@@ -144,7 +204,25 @@
{
cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
default:
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_1:
{
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q8_0:
{
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
@@ -173,7 +251,7 @@ bool ggml_common_quantize_0(
}
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | mtype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, mtype, ggml_type_name(qtype));
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
{
int64_t sum_all = 0;

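ggml_common_quantize_0 walks the tensors, converts each to f32 in data_f32, quantizes it into work, and accumulates a per-bucket histogram; the new q5_0/q5_1/q8_0 cases slot into the same loop by calling the matching ggml_quantize_* routine. A self-contained sketch (not part of the patch) of what one Q5_0 iteration does for a single 2-D tensor:

    #include "ggml.h"
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t ne[2] = {4096, 32};                        // example tensor shape
        const int nelements = (int)(ne[0] * ne[1]);

        std::vector<float>   data_f32(nelements, 0.5f);          // tensor data converted to f32
        std::vector<char>    work(nelements * sizeof(float));    // output; never larger than the f32 input
        std::vector<int64_t> hist_cur(1 << 4, 0);                // histogram buckets, as in the loop above

        const size_t cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(),
                                                   nelements, (int) ne[0], hist_cur.data());
        printf("q5_0: %d floats -> %zu bytes\n", nelements, cur_size);
        return 0;
    }

The row length ne[0] must be a multiple of the quantization block size (32 for q5_0).
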
View file

@@ -1,23 +1,37 @@
#pragma once
#include "ggml.h"
#include <map>
#include <fstream>
#include <vector>
#include <string>
// model file types
enum ggml_mtype {
GGML_MTYPE_ALL_F32 = 0,
GGML_MTYPE_MOSTLY_F16 = 1, // except 1d tensors
GGML_MTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
GGML_MTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
GGML_MTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
GGML_MTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
GGML_MTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
GGML_FTYPE_ALL_F32 = 0,
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
void ggml_print_ftypes(FILE * fp = stderr);
enum ggml_ftype ggml_parse_ftype(const char * str);
// TODO: temporary
enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype);
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_mtype mtype,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip);

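The header swaps the old ggml_mtype enum (bare numeric "model types") for ggml_ftype, adding GGML_FTYPE_UNKNOWN plus the q8_0/q5_0/q5_1 entries, and declares parse/print helpers so callers no longer hard-code the numbers. A short usage sketch, assuming common-ggml.h is on the include path:

    #include "common-ggml.h"
    #include <cstdio>

    int main() {
        ggml_print_ftypes();                           // lists the named types and their numbers (to stderr)
        enum ggml_ftype a = ggml_parse_ftype("q5_1");  // by name   -> GGML_FTYPE_MOSTLY_Q5_1 (9)
        enum ggml_ftype b = ggml_parse_ftype("9");     // by number -> the same value
        printf("%d %d\n", a, b);
        return 0;
    }
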
View file

@@ -1,6 +1,4 @@
#include "ggml.h"
#include "otherarch/utils.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -20,11 +18,11 @@ struct gpt2_hparams {
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
int32_t f16 = 1;
int32_t ftype = 1;
};
// quantize a model
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -62,21 +60,21 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: f16 = %d\n", __func__, hparams.ftype);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &mtype, sizeof(hparams.f16));
fout.write((char *) &ftype, sizeof(hparams.ftype));
}
// load vocab
@@ -116,7 +114,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
"model/h.*/mlp/c_proj/w",
};
if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
@@ -134,10 +132,7 @@ int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = 5 - q4_2\n");
fprintf(stderr, " type = 6 - q4_3\n");
ggml_print_ftypes(stderr);
return 1;
}
@@ -151,7 +146,7 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const int mtype = atoi(argv[3]);
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
@@ -161,7 +156,7 @@
{
const int64_t t_start_us = ggml_time_us();
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}

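With ggml_parse_ftype and ggml_print_ftypes wired in, the quantizer's third argument now accepts either a format name or its numeric ftype, and the usage message prints the full list instead of the four hard-coded q4 values. Example invocations, using the quantize_gpt2 target from the Makefile above and placeholder file names:

    ./quantize_gpt2 ggml-model-f32.bin ggml-model-q5_0.bin q5_0
    ./quantize_gpt2 ggml-model-f32.bin ggml-model-q5_0.bin 8

Both forms resolve to GGML_FTYPE_MOSTLY_Q5_0.
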
View file

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "otherarch/utils.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -25,7 +25,7 @@ struct gptj_hparams {
};
// quantize a model
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -79,7 +79,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &mtype, sizeof(hparams.f16));
fout.write((char *) &ftype, sizeof(hparams.f16));
}
// load vocab
@@ -114,7 +114,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
".*weight",
};
if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
@@ -132,10 +132,7 @@ int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = 5 - q4_2\n");
fprintf(stderr, " type = 6 - q4_3\n");
ggml_print_ftypes(stderr);
return 1;
}
@@ -149,7 +146,7 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const int mtype = atoi(argv[3]);
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
@@ -159,7 +156,7 @@
{
const int64_t t_start_us = ggml_time_us();
if (!gptj_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}

View file

@@ -1,6 +1,6 @@
#include "ggml.h"
#include "otherarch/utils.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
@@ -25,7 +25,7 @@ struct stablelm_hparams {
};
// quantize a model
bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -79,7 +79,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &mtype, sizeof(hparams.ftype));
fout.write((char *) &ftype, sizeof(hparams.ftype));
}
// load vocab
@@ -106,7 +106,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
".*weight",
};
if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
@@ -121,12 +121,10 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
// ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = 5 - q4_2\n");
fprintf(stderr, " type = 6 - q4_3\n");
ggml_print_ftypes(stderr);
return 1;
}
@@ -140,7 +138,7 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const int mtype = atoi(argv[3]);
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
@@ -150,7 +148,7 @@
{
const int64_t t_start_us = ggml_time_us();
if (!stablelm_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}