quantize: be able to specify the output tensor type

Iwan Kawrakow 2024-03-22 16:11:34 +02:00
parent b2075fd6a5
commit 7883796f71
3 changed files with 34 additions and 10 deletions

llama.h

@@ -277,6 +277,7 @@ extern "C" {
     typedef struct llama_model_quantize_params {
         int32_t nthread;                   // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;            // quantize to this llama_ftype
+        enum ggml_type output_tensor_type; // output tensor type
         bool allow_requantize;             // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;       // quantize output.weight
         bool only_copy;                    // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
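
For reference, the new field is set on the params struct before calling llama_model_quantize(). A minimal sketch of how a caller might use it, assuming the C API as of this commit (the model file names are placeholders, and the default value of output_tensor_type is assumed to mean "unspecified", leaving the previous behavior intact when not set):

    // Quantize a model to Q4_K_M overall, but force output.weight to Q8_0.
    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.nthread            = 8;                        // worker threads for quantization
        params.ftype              = LLAMA_FTYPE_MOSTLY_Q4_K_M; // overall quantization type
        params.output_tensor_type = GGML_TYPE_Q8_0;            // new: override type of output.weight

        // Returns 0 on success.
        if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }

Keeping output.weight at a higher precision such as Q8_0 is a common choice, since the output projection tends to be more sensitive to quantization error than the bulk of the weights.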