custom quantization schemas
This commit is contained in:
parent 31e2f5668c
commit dbe6483e7e

3 changed files with 100 additions and 6 deletions
examples/quantize/quantize.cpp

@@ -49,11 +49,11 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,  " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,   "13.00G @ 7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,      "26.00G @ 7B", },
+    { "CUSTOM", LLAMA_FTYPE_CUSTOM,       "per-layer scheme from file (quant.cfg)", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",   LLAMA_FTYPE_ALL_F32,      "only copy tensors, no quantizing", },
 };


 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;

@@ -247,6 +247,60 @@ static bool parse_kv_override(const char * data, std::vector<llama_model_kv_over
     return true;
 }

+static bool read_custom_quant_config(const std::string & filename, llama_model_quantize_ftype_override & override) {
+    std::ifstream file(filename);
+    std::string line;
+    std::vector<std::string> names;
+    std::vector<ggml_type> types;
+
+    printf("%s: reading custom quantization scheme from %s:\n", __func__, filename.c_str());
+
+    if (!file.is_open()) {
+        fprintf(stderr, "%s: failed to open file: '%s'\n", __func__, filename.c_str());
+        return false;
+    }
+
+    while (getline(file, line)) {
+        // Skip empty lines and comments
+        if (line.empty() || line[0] == '#') continue;
+        printf("  %s\n", line.c_str());
+
+        // default file type
+        if (line.find("ftype=") == 0) {
+            int ftype = std::stoi(line.substr(6));
+            override.default_ftype = static_cast<llama_ftype>(ftype);
+            printf("  default ftype = %i\n", ftype);
+            continue;
+        }
+
+        // tensor overrides
+        size_t pos = line.find('=');
+        if (pos != std::string::npos) {
+            std::string name = line.substr(0, pos);
+            int type = std::stoi(line.substr(pos + 1));
+            names.push_back(name);
+            types.push_back(static_cast<ggml_type>(type));
+            printf("  %s = %i\n", name.c_str(), type);
+        }
+    }
+
+    printf("\n");
+
+    // allocate memory for names and types
+    override.names = new const char *[names.size()];
+    override.types = new ggml_type[types.size()];
+    override.count = names.size();
+
+    for (size_t i = 0; i < names.size(); ++i) {
+        override.names[i] = strdup(names[i].c_str());
+        override.types[i] = types[i];
+    }
+
+    file.close();
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
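To make the format concrete, here is a hypothetical quant.cfg that read_custom_quant_config would accept. The tensor names and numeric codes are illustrative, not part of the commit: ftype= takes the integer value of a llama_ftype, each name=value line takes the integer value of a ggml_type, and blank lines or lines starting with '#' are skipped.

    # hypothetical quant.cfg (names and values illustrative)
    # default ftype for tensors without an override; 15 = LLAMA_FTYPE_MOSTLY_Q4_K_M
    ftype=15

    # per-tensor overrides in <tensor name>=<ggml_type value> form
    # 14 = GGML_TYPE_Q6_K, 13 = GGML_TYPE_Q5_K
    output.weight=14
    blk.0.attn_v.weight=13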
@@ -352,13 +406,24 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;
     }

     if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
         return 1;
     }

     if (ftype_str == "COPY") {
         params.only_copy = true;
     }

+    if (ftype_str == "CUSTOM") {
+        params.override_ftype = new llama_model_quantize_ftype_override;
+        if (!read_custom_quant_config("quant.cfg", *params.override_ftype)) {
+            fprintf(stderr, "%s: failed to read custom quant config file!\n", __func__);
+            return 1;
+        }
+    }
+
     arg_idx++;
 }
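Assuming the example binary keeps its usual name, a hypothetical run would then look like this; note that the config path is hard-coded to quant.cfg in the current working directory:

    ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-custom.gguf CUSTOM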
llama.cpp (28 changed lines)
@@ -3610,6 +3610,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";

+        // Custom quantization scheme
+        case LLAMA_FTYPE_CUSTOM: return "CUSTOM";
+
         default: return "unknown, may not work";
     }
 }
@@ -14195,9 +14198,13 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa

 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
-    llama_ftype ftype = params->ftype;
-
-    switch (params->ftype) {
+    llama_ftype ftype =
+        params->override_ftype
+        ? params->override_ftype->default_ftype
+        : params->ftype;
+
+    switch (ftype) {
     case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
     case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
     case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
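In effect, when an override file is loaded, the switch above runs on default_ftype from quant.cfg rather than on the CUSTOM ftype given on the command line, so the config file's ftype= entry is what selects default_type for every tensor that has no explicit override.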
@@ -14279,7 +14286,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.file_type", params->ftype);
     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -14417,6 +14424,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = params->output_tensor_type;
         }

+        // look up the tensor name in the type override map; if not found, use the
+        // default type as determined by the ftype.
+        if (params->override_ftype) {
+            for (uint32_t i = 0; i < params->override_ftype->count; ++i) {
+                if (strcmp(params->override_ftype->names[i], tensor->name) == 0) {
+                    //LLAMA_LOG_INFO("\n%s: %s %s ---> %s\n", __func__, tensor->name, ggml_type_name(new_type), ggml_type_name(params->override_ftype->types[i]));
+                    new_type = params->override_ftype->types[i];
+                    break;
+                }
+            }
+        }
+
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
         quantize = tensor->type != new_type;
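Matching here is an exact strcmp against the tensor names stored in the GGUF file, for example output.weight or blk.0.attn_v.weight; there is no wildcard or pattern support, and tensors without an entry keep the type chosen by the default-ftype logic above.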
@@ -14886,7 +14905,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy      =*/ false,
         /*.pure           =*/ false,
         /*.imatrix        =*/ nullptr,
         /*.kv_overrides   =*/ nullptr,
+        /*.override_ftype =*/ nullptr
     };

     return result;
llama.h (11 changed lines)
@@ -122,6 +122,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
+        LLAMA_FTYPE_CUSTOM        = 32, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -278,6 +279,13 @@ extern "C" {
         void * abort_callback_data;
     };

+    typedef struct llama_model_quantize_ftype_override {
+        enum llama_ftype default_ftype; // default ftype if not overridden
+        uint32_t count;                 // number of overrides
+        const char ** names;            // tensor names
+        enum ggml_type * types;         // per-tensor type overrides
+    } llama_model_quantize_custom_ftype;
+
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
@@ -286,10 +294,11 @@ extern "C" {
         enum ggml_type token_embedding_type; // token embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool only_copy;                      // only copy tensors - ftype, override_ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
+        struct llama_model_quantize_ftype_override * override_ftype; // custom quantization scheme
     } llama_model_quantize_params;

     // grammar types
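Finally, a minimal sketch of exercising the new API surface directly from C++, without going through quant.cfg. It assumes the patched llama.h above; the tensor names, types, and file names are placeholders:

    #include "llama.h"

    int main() {
        // illustrative per-tensor overrides; names must match GGUF tensor names
        const char * names[] = { "output.weight", "blk.0.attn_v.weight" };
        enum ggml_type types[] = { GGML_TYPE_Q6_K, GGML_TYPE_Q5_K };

        llama_model_quantize_ftype_override override;
        override.default_ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // fallback for unlisted tensors
        override.count         = 2;
        override.names         = names;
        override.types         = types;

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype          = LLAMA_FTYPE_CUSTOM;
        params.override_ftype = &override;

        // returns 0 on success
        return (int) llama_model_quantize("ggml-model-f16.gguf", "ggml-model-custom.gguf", &params);
    }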