minor verbose messages

John 2023-06-18 02:10:26 +02:00
parent abc77a7496
commit 5ecd645bce
2 changed files with 24 additions and 12 deletions

View file

@@ -147,6 +147,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
+    fprintf(stderr, "Falcon quantizer and ggml v3 converter. Important: currently the Q_K variants do not work with the 7B model (use Q_x for now with 7B)\n");
     fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
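For orientation, a hypothetical invocation matching the usage string above; the model paths, the q5_1 type, and the thread count are placeholders, not files or settings from this commit. Per the new banner, a non-K ("Q_x") type is the safer choice for the 7B model until the Q_K variants work there:

    ./quantize --leave-output-tensor models/falcon-7b/ggml-model-f32.bin models/falcon-7b/ggml-model-q5_1.bin q5_1 8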

View file

@@ -689,6 +689,8 @@ struct llama_model_loader {
             *ctx_size_p += ggml_tensor_overhead();
             *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
         }
+        printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
+               tensors_map.tensors.size(), *ctx_size_p, *mmapped_size_p);
     }

     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
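To illustrate what the new calc_sizes() message reports, here is a minimal standalone sketch of the accounting visible in this hunk. The tensor_entry struct and the fixed overhead constant are stand-ins for the loader's real tensor records and for ggml_tensor_overhead(); only the accumulation pattern and the printf format are taken from the diff.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct tensor_entry { size_t size; };       // stand-in: bytes of one tensor's data
    static const size_t tensor_overhead = 256;  // stand-in for ggml_tensor_overhead()

    // Every tensor adds metadata overhead to the ggml context; its data bytes count
    // against the mmapped total when mmap is used, otherwise against the context.
    static void calc_sizes_sketch(const std::vector<tensor_entry> & tensors, bool use_mmap,
                                  size_t * ctx_size_p, size_t * mmapped_size_p) {
        *ctx_size_p = 0;
        *mmapped_size_p = 0;
        for (const auto & lt : tensors) {
            *ctx_size_p += tensor_overhead;
            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
        }
        printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
               tensors.size(), *ctx_size_p, *mmapped_size_p);
    }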
@@ -2386,19 +2388,19 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
-    for (auto& tensor : model_loader->tensors_map.tensors) {
-        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
-            ++n_attention_wv;
-        }
-        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
-        }
-    }
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
+    // int n_attention_wv = 0;
+    // int n_feed_forward_w2 = 0;
+    // for (auto& tensor : model_loader->tensors_map.tensors) {
+    //     if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+    //         ++n_attention_wv;
+    //     }
+    //     else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+    //         ++n_feed_forward_w2;
+    //     }
+    // }
+    // int i_attention_wv = 0;
+    // int i_feed_forward_w2 = 0;
 #endif

     size_t total_size_org = 0;
@@ -2427,6 +2429,11 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
         quantize &= (tensor.ne.size() == 2);
         quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
         quantize &= quantized_type != tensor.type;
+        if (tensor.name.find("mlp") == std::string::npos) {
+            // quantize = false;
+        }

         enum ggml_type new_type;
         void * new_data;
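For readability, the per-tensor filter this hunk extends can be summarized as a standalone predicate. This is only a sketch: the parameter list and the plain enum are illustrative stand-ins for the loader's tensor record and ggml's type enum, and the mlp-name gate is left disabled exactly as it is in the diff.

    #include <cstddef>
    #include <string>

    enum quant_type { TYPE_F32, TYPE_F16, TYPE_Q5_1, TYPE_Q6_K };  // stand-in for ggml_type

    static bool should_quantize(const std::string & name, size_t n_dims,
                                quant_type tensor_type, quant_type target_type,
                                bool quantize_output_tensor) {
        bool quantize = true;
        quantize &= (n_dims == 2);                                      // only 2-D weight matrices
        quantize &= quantize_output_tensor || name != "output.weight";  // optionally leave output.weight alone
        quantize &= target_type != tensor_type;                         // skip tensors already in the target type
        if (name.find("mlp") == std::string::npos) {
            // quantize = false;  // placeholder gate from the diff, currently a no-op
        }
        return quantize;
    }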
@@ -2445,6 +2452,10 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
         //     new_type = GGML_TYPE_Q6_K;
         // }
         // TODO falcon
+        // if (tensor.name.find("input_layernorm") != std::string::npos) {
+        //     new_type = tensor.type;
+        // }
 #endif

         float * f32_data;