diff --git a/examples/falcon_quantize/quantize.cpp b/examples/falcon_quantize/quantize.cpp
index 3af7f3ffe..dda5b509e 100644
--- a/examples/falcon_quantize/quantize.cpp
+++ b/examples/falcon_quantize/quantize.cpp
@@ -147,6 +147,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
+    fprintf(stderr, "Falcon quantizer and ggml v3 converter. Important: currently the Q_K variants do not work with the 7B model (use Q_x for now with 7B)\n");
     fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
diff --git a/libfalcon.cpp b/libfalcon.cpp
index cdb38eba0..156cd23dc 100644
--- a/libfalcon.cpp
+++ b/libfalcon.cpp
@@ -689,6 +689,8 @@ struct llama_model_loader {
             *ctx_size_p += ggml_tensor_overhead();
             *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
         }
+        printf("calc_sizes(): %zu tensors, %zu bytes in context, %zu bytes mmapped\n",
+               tensors_map.tensors.size(), *ctx_size_p, *mmapped_size_p);
     }
 
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) {
@@ -2386,19 +2388,19 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv = 0;
-    int n_feed_forward_w2 = 0;
-    for (auto& tensor : model_loader->tensors_map.tensors) {
-        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
-            ++n_attention_wv;
-        }
-        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
-        }
-    }
+    // int n_attention_wv = 0;
+    // int n_feed_forward_w2 = 0;
+    // for (auto& tensor : model_loader->tensors_map.tensors) {
+    //     if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+    //         ++n_attention_wv;
+    //     }
+    //     else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+    //         ++n_feed_forward_w2;
+    //     }
+    // }
 
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
+    // int i_attention_wv = 0;
+    // int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -2427,6 +2429,11 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
         quantize &= (tensor.ne.size() == 2);
         quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
         quantize &= quantized_type != tensor.type;
+        if (tensor.name.find("mlp") == std::string::npos) {
+            // quantize = false;
+        }
+
+
 
         enum ggml_type new_type;
         void * new_data;
@@ -2445,6 +2452,10 @@ static void falcon_model_quantize_internal(const std::string & fname_inp, const
         //     new_type = GGML_TYPE_Q6_K;
         // }
         // TODO falcon
+        // if (tensor.name.find("input_layernorm") != std::string::npos) {
+        //     new_type = tensor.type;
+        // }
+
 #endif
 
         float * f32_data;