minor verbose messages
parent abc77a7496
commit 5ecd645bce

2 changed files with 24 additions and 12 deletions
@@ -147,6 +147,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
//
void usage(const char * executable) {
    fprintf(stderr, "Falcon quantizer and ggml v3 converter. Important: currently the Q_K variants do not work with the 7B model (use Q_x for now with 7B)\n");
    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
    fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
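The hunk above only changes the wording of the help text. For context, a usage() helper like this is normally invoked from main() when --help is passed or required arguments are missing; the following is a minimal, hypothetical sketch of that wiring (simplified names, not code from this repository or this commit):

#include <cstdio>
#include <cstring>

// Hypothetical illustration (not from this commit): print the help text.
static void usage(const char * executable) {
    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
}

int main(int argc, char ** argv) {
    // Bail out with the help text when --help is passed or required arguments are missing.
    if (argc < 3 || strcmp(argv[1], "--help") == 0) {
        usage(argv[0]);
        return 1;
    }
    // ... flag parsing and quantization would follow here ...
    return 0;
}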
@@ -689,6 +689,8 @@ struct llama_model_loader {
            *ctx_size_p += ggml_tensor_overhead();
            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
        }
        printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
               tensors_map.tensors.size(), *ctx_size_p, *mmapped_size_p);
    }

    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
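The bookkeeping shown above follows a simple pattern: every tensor contributes the ggml object overhead to the context size, while its data bytes are added either to the mmapped total or to the context total depending on use_mmap, and the new printf reports both counts. A stand-alone sketch of the same idea, using a hypothetical TensorInfo type and an assumed overhead constant rather than the loader's real structures:

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the loader's per-tensor entry.
struct TensorInfo {
    size_t size; // bytes of tensor data
};

// Assumed constant standing in for ggml_tensor_overhead().
static const size_t kTensorOverhead = 256;

static void calc_sizes(const std::vector<TensorInfo> & tensors, bool use_mmap,
                       size_t * ctx_size_p, size_t * mmapped_size_p) {
    *ctx_size_p     = 0;
    *mmapped_size_p = 0;
    for (const TensorInfo & t : tensors) {
        *ctx_size_p += kTensorOverhead;                      // metadata always lives in the context
        *(use_mmap ? mmapped_size_p : ctx_size_p) += t.size; // data goes to one pool or the other
    }
    printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
           tensors.size(), *ctx_size_p, *mmapped_size_p);
}

int main() {
    std::vector<TensorInfo> tensors = { {1024}, {2048}, {4096} };
    size_t ctx_size = 0, mmapped_size = 0;
    calc_sizes(tensors, /*use_mmap=*/true, &ctx_size, &mmapped_size);
    return 0;
}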
@@ -2386,19 +2388,19 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

#ifdef GGML_USE_K_QUANTS
    int n_attention_wv = 0;
    int n_feed_forward_w2 = 0;
    for (auto& tensor : model_loader->tensors_map.tensors) {
        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
            ++n_attention_wv;
        }
        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
            ++n_feed_forward_w2;
        }
    }
    // int n_attention_wv = 0;
    // int n_feed_forward_w2 = 0;
    // for (auto& tensor : model_loader->tensors_map.tensors) {
    // if (tensor.name.find("attention.wv.weight") != std::string::npos) {
    // ++n_attention_wv;
    // }
    // else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
    // ++n_feed_forward_w2;
    // }
    // }

    int i_attention_wv = 0;
    int i_feed_forward_w2 = 0;
    // int i_attention_wv = 0;
    // int i_feed_forward_w2 = 0;
#endif

    size_t total_size_org = 0;
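The GGML_USE_K_QUANTS block above counts how many attention.wv.weight and feed_forward.w2.weight tensors the model contains before quantization starts; the matching i_* counters are incremented later as those tensors are processed, presumably to pick per-layer k-quant types. A reduced sketch of the same substring-counting idiom over a plain list of names (hypothetical data, not the loader's actual tensor map):

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Hypothetical tensor names; the real loop walks model_loader->tensors_map.tensors.
    std::vector<std::string> names = {
        "layers.0.attention.wv.weight",
        "layers.0.feed_forward.w2.weight",
        "layers.1.attention.wv.weight",
        "output.weight",
    };

    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    for (const std::string & name : names) {
        if (name.find("attention.wv.weight") != std::string::npos) {
            ++n_attention_wv;
        } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
            ++n_feed_forward_w2;
        }
    }
    printf("n_attention_wv = %d, n_feed_forward_w2 = %d\n", n_attention_wv, n_feed_forward_w2);
    return 0;
}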
@@ -2427,6 +2429,11 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
        quantize &= (tensor.ne.size() == 2);
        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
        quantize &= quantized_type != tensor.type;
        if (tensor.name.find("mlp") == std::string::npos) {
            // quantize = false;
        }



        enum ggml_type new_type;
        void * new_data;
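The quantize &= chain above narrows a single flag: a tensor is quantized only if it is two-dimensional, is not the optionally protected output.weight tensor, and is not already stored in the target type. A hedged, stand-alone sketch of that filtering logic, with simplified hypothetical Tensor and QuantizeParams types standing in for the real structures:

#include <cstddef>
#include <cstdio>
#include <string>

// Simplified, hypothetical stand-ins for the real tensor/params structures.
struct Tensor {
    std::string name;
    size_t      n_dims; // number of dimensions (tensor.ne.size() in the real code)
    int         type;   // current storage type id
};

struct QuantizeParams {
    bool quantize_output_tensor;
};

// Returns true if this tensor should be (re)quantized to quantized_type.
static bool should_quantize(const Tensor & tensor, const QuantizeParams & params, int quantized_type) {
    bool quantize = true;
    quantize &= (tensor.n_dims == 2);                                            // only 2-D weight matrices
    quantize &= params.quantize_output_tensor || tensor.name != "output.weight"; // optionally protect output.weight
    quantize &= quantized_type != tensor.type;                                   // skip if already in the target type
    return quantize;
}

int main() {
    Tensor t = { "layers.0.attention.wq.weight", 2, /*type=*/0 };
    QuantizeParams params = { /*quantize_output_tensor=*/false };
    printf("quantize? %s\n", should_quantize(t, params, /*quantized_type=*/14) ? "yes" : "no");
    return 0;
}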
@@ -2445,6 +2452,10 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
            // new_type = GGML_TYPE_Q6_K;
            // }
            // TODO falcon
            // if (tensor.name.find("input_layernorm") != std::string::npos) {
            // new_type = tensor.type;
            // }

#endif

            float * f32_data;