llama : support requantizing models instead of only allowing quantization from 16/32bit (#1691)

* Add support for quantizing already quantized models

* Threaded dequantizing and f16 to f32 conversion

* Clean up the thread block/spares calculation a bit

* Use std::runtime_error exceptions.
Kerfuffle, 2023-06-10 01:59:17 -06:00 (committed by GitHub)
parent ef3171d162
commit 4f0154b0ba
3 changed files with 134 additions and 40 deletions

llama.h (14 changes)

@@ -115,7 +115,16 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
     };
 
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
     LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
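
As a usage aside (not part of the diff): a minimal sketch of filling in the new struct. The field names come from the hunk above; LLAMA_FTYPE_MOSTLY_Q4_0 is just an example target type, and starting from llama_model_quantize_default_params() keeps unlisted fields at their library defaults.

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0; // example target type
    params.nthread                = 0;    // <=0 => std::thread::hardware_concurrency()
    params.allow_requantize       = true; // accept non-f32/f16 (already quantized) input tensors
    params.quantize_output_tensor = true; // also quantize output.weight
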
@@ -137,14 +146,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype   ftype,
-            int          nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
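
For existing callers, migrating to the new entry point is mechanical; a hedged sketch, with placeholder file names and error handling omitted:

    #include "llama.h"

    // Old call (removed above): llama_model_quantize(inp, out, LLAMA_FTYPE_MOSTLY_Q4_0, 4);
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype            = LLAMA_FTYPE_MOSTLY_Q4_0;
    params.nthread          = 4;
    params.allow_requantize = true; // new: the input file may itself be quantized

    int rc = llama_model_quantize("ggml-model-q8_0.bin", // placeholder input path
                                  "ggml-model-q4_0.bin", // placeholder output path
                                  &params);
    // rc == 0 on success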