quantize: be able to specify the output tensor type

Iwan Kawrakow 2024-03-22 16:11:34 +02:00
parent b2075fd6a5
commit 7883796f71
3 changed files with 34 additions and 10 deletions

llama.h

@@ -277,6 +277,7 @@ extern "C" {
     typedef struct llama_model_quantize_params {
         int32_t nthread;                   // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;            // quantize to this llama_ftype
+        enum ggml_type output_tensor_type; // output tensor type
         bool allow_requantize;             // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;       // quantize output.weight
         bool only_copy;                    // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
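
For reference, the new field is set on the params struct before calling llama_model_quantize(). A minimal sketch of how a caller might use it, assuming the C API as of this commit (the model file names are placeholders, and the default value of output_tensor_type is assumed to mean "unspecified", leaving the previous behavior intact when not set):

    // Quantize a model to Q4_K_M overall, but force output.weight to Q8_0.
    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.nthread            = 8;                        // worker threads for quantization
        params.ftype              = LLAMA_FTYPE_MOSTLY_Q4_K_M; // overall quantization type
        params.output_tensor_type = GGML_TYPE_Q8_0;            // new: override type of output.weight

        // Returns 0 on success.
        if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }

Keeping output.weight at a higher precision such as Q8_0 is a common choice, since the output projection tends to be more sensitive to quantization error than the bulk of the weights.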