llama : support requantizing models instead of only allowing quantization from 16/32bit (#1691)

* Add support for quantizing already quantized models

* Threaded dequantizing and f16 to f32 conversion

* Clean up the thread block/spares calculation a bit

* Use std::runtime_error exceptions.
Kerfuffle, 2023-06-10 01:59:17 -06:00 (committed by GitHub)
parent ef3171d162
commit 4f0154b0ba
3 changed files with 134 additions and 40 deletions

llama.h (14 changes)

@@ -115,7 +115,16 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
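
As a usage aside (not part of the diff): a minimal sketch of filling in the new struct. The field names come from the hunk above; LLAMA_FTYPE_MOSTLY_Q4_0 is just an example target type, and starting from llama_model_quantize_default_params() keeps unlisted fields at their library defaults.

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0; // example target type
    params.nthread                = 0;    // <=0 => std::thread::hardware_concurrency()
    params.allow_requantize       = true; // accept non-f32/f16 (already quantized) input tensors
    params.quantize_output_tensor = true; // also quantize output.weight
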
@@ -137,14 +146,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype   ftype,
-            int          nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
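
For existing callers, migrating to the new entry point is mechanical; a hedged sketch, with placeholder file names and error handling omitted:

    #include "llama.h"

    // Old call (removed above): llama_model_quantize(inp, out, LLAMA_FTYPE_MOSTLY_Q4_0, 4);
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype            = LLAMA_FTYPE_MOSTLY_Q4_0;
    params.nthread          = 4;
    params.allow_requantize = true; // new: the input file may itself be quantized

    int rc = llama_model_quantize("ggml-model-q8_0.bin", // placeholder input path
                                  "ggml-model-q4_0.bin", // placeholder output path
                                  &params);
    // rc == 0 on success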