minor verbose messages
parent abc77a7496
commit 5ecd645bce
2 changed files with 24 additions and 12 deletions
@@ -147,6 +147,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
+    fprintf(stderr, "Falcon quantizer and ggml v3 converter. Important: currently the Q_K variants do not work with the 7B model (use Q_x for now with 7B)\n");
     fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
     fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@@ -689,6 +689,8 @@ struct llama_model_loader {
             *ctx_size_p += ggml_tensor_overhead();
             *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
         }
+        printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
+               tensors_map.tensors.size(), *ctx_size_p, *mmapped_size_p);
     }

     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
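The new calc_sizes() trace simply reports the loader's bookkeeping with printf and the %zu format specifier for size_t. Below is a minimal, self-contained sketch of the same reporting pattern, with the loader replaced by illustrative stand-ins (the variable names and byte counts are assumptions, not taken from the repository):

    #include <cstdio>
    #include <vector>

    int main() {
        // Stand-ins for the loader's bookkeeping: per-tensor byte counts (illustrative values).
        std::vector<std::size_t> tensor_sizes = {4096, 8192, 16384};

        std::size_t ctx_size     = 0;     // bytes that will live in the ggml context
        std::size_t mmapped_size = 0;     // bytes served from the memory-mapped file
        bool        use_mmap     = true;

        for (std::size_t sz : tensor_sizes) {
            ctx_size += 64;                                  // stand-in for per-tensor overhead
            *(use_mmap ? &mmapped_size : &ctx_size) += sz;   // same ternary dispatch as in the loader
        }

        // %zu is the portable format specifier for size_t, as used by the new trace line.
        printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
               tensor_sizes.size(), ctx_size, mmapped_size);
        return 0;
    }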
@@ -2386,19 +2388,19 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
-    for (auto& tensor : model_loader->tensors_map.tensors) {
-        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
-            ++n_attention_wv;
-        }
-        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
-        }
-    }
+    // int n_attention_wv = 0;
+    // int n_feed_forward_w2 = 0;
+    // for (auto& tensor : model_loader->tensors_map.tensors) {
+    //     if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+    //         ++n_attention_wv;
+    //     }
+    //     else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+    //         ++n_feed_forward_w2;
+    //     }
+    // }

-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
+    // int i_attention_wv = 0;
+    // int i_feed_forward_w2 = 0;
 #endif

     size_t total_size_org = 0;
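This hunk disables the k-quants bookkeeping that counted tensors by LLaMA-style name substrings ("attention.wv.weight", "feed_forward.w2.weight"), which presumably do not occur in Falcon checkpoints. For reference, the counting idiom it used is plain substring matching with std::string::find; a self-contained sketch with made-up tensor names:

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        // Illustrative tensor names; a real model loader would supply these.
        std::vector<std::string> names = {
            "layers.0.attention.wv.weight",
            "layers.0.feed_forward.w2.weight",
            "layers.1.attention.wv.weight",
        };

        int n_attention_wv    = 0;
        int n_feed_forward_w2 = 0;
        for (const auto & name : names) {
            // find() returns std::string::npos when the substring is absent.
            if (name.find("attention.wv.weight") != std::string::npos) {
                ++n_attention_wv;
            } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
                ++n_feed_forward_w2;
            }
        }
        printf("attention.wv: %d, feed_forward.w2: %d\n", n_attention_wv, n_feed_forward_w2);
        return 0;
    }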
@@ -2427,6 +2429,11 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
         quantize &= (tensor.ne.size() == 2);
         quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
         quantize &= quantized_type != tensor.type;
+        if (tensor.name.find("mlp") == std::string::npos) {
+            // quantize = false;
+        }
+
+

         enum ggml_type new_type;
         void * new_data;
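The added block is currently inert (its body is commented out), but it introduces a per-tensor gate on whether the name contains "mlp", alongside the existing chain of quantize &= conditions. A minimal sketch of how such a gate composes with those conditions; the struct, field names, and example tensor names below are illustrative assumptions, not the repository's types:

    #include <cstdio>
    #include <string>

    // Illustrative stand-in for the per-tensor metadata the quantizer inspects.
    struct tensor_info {
        std::string name;
        int         n_dims;
    };

    static bool should_quantize(const tensor_info & t, bool quantize_output_tensor) {
        bool quantize = true;
        quantize &= (t.n_dims == 2);                                      // only 2D weight matrices
        quantize &= quantize_output_tensor || t.name != "output.weight";  // optionally skip output.weight
        // The new hook: tensors whose name lacks "mlp" could be excluded here.
        // The body is commented out in the actual change, so it has no effect yet.
        if (t.name.find("mlp") == std::string::npos) {
            // quantize = false;
        }
        return quantize;
    }

    int main() {
        printf("%d\n", should_quantize({"transformer.h.0.mlp.dense_h_to_4h.weight", 2}, false)); // prints 1
        printf("%d\n", should_quantize({"output.weight", 2}, false));                            // prints 0
        return 0;
    }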
@@ -2445,6 +2452,10 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
         //     new_type = GGML_TYPE_Q6_K;
         // }
         // TODO falcon
+        // if (tensor.name.find("input_layernorm") != std::string::npos) {
+        //     new_type = tensor.type;
+        // }
+
 #endif

         float * f32_data;
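The new commented-out lines sketch a per-tensor type override: if a tensor's name contains "input_layernorm", keep its current type instead of the requested quantization target. A small illustration of that selection pattern, using a stand-in enum rather than ggml's real type enum (names and values here are assumptions):

    #include <cstdio>
    #include <string>

    // Illustrative stand-in for ggml's tensor type enum.
    enum my_type { TYPE_F32, TYPE_Q4_0, TYPE_Q6_K };

    // Pick the output type for one tensor: default to the requested quantized type,
    // but leave layernorm-like tensors at their current type (mirrors the disabled hook).
    static my_type pick_type(const std::string & name, my_type current, my_type quantized) {
        my_type new_type = quantized;
        if (name.find("input_layernorm") != std::string::npos) {
            new_type = current;   // keep the tensor as-is
        }
        return new_type;
    }

    int main() {
        printf("%d\n", pick_type("transformer.h.0.input_layernorm.weight",   TYPE_F32, TYPE_Q4_0)); // prints 0 (stays F32)
        printf("%d\n", pick_type("transformer.h.0.mlp.dense_h_to_4h.weight", TYPE_F32, TYPE_Q4_0)); // prints 1 (quantized)
        return 0;
    }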