llama : allow overriding GGUF metadata when loading model (#4092)

* feat: Allow overriding GGUF metadata when loading model
* Fix the one time GCC is stricter than clang about something
* Step1
* Refactor... basically everything!
* Nuke obsolete GetArrayLen struct
* simplify std::string specialization
* Various cleanups
  Add informational output when overrides are applied
  Warn user when an override with the wrong type is specified
* Fix broken logic for parsing bool KV overrides
  Fix issue where overrides didn't apply when key missing in GGUF metadata
  Resolve merge changes
* llama : rearrange model params
* Update new GET_KEY call
  Add note that metadata KV overrides aren't reflected in initial metadata KV info dump

---------

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-12-05 10:19:18 -07:00 · 2023-12-05 10:19:18 -07:00 · 5aa365d88f
commit 5aa365d88f
parent 52c8bc3cf3
4 changed files with 361 additions and 86 deletions
--- a/llama.h
+++ b/llama.h
@@ -158,6 +158,22 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

+    enum llama_model_kv_override_type { // type tag stored in llama_model_kv_override.tag
+        LLAMA_KV_OVERRIDE_INT, // NOTE(review): presumably pairs with int_value (int64_t) - confirm in loader
+        LLAMA_KV_OVERRIDE_FLOAT, // NOTE(review): presumably pairs with float_value (double) - confirm in loader
+        LLAMA_KV_OVERRIDE_BOOL, // NOTE(review): presumably pairs with bool_value (bool) - confirm in loader
+    };
+
+    struct llama_model_kv_override { // one override entry for the model meta data (see llama_model_params.kv_overrides)
+        char key[128]; // name of the key-value pair to override, stored inline in a fixed 128-char buffer
+        enum llama_model_kv_override_type tag; // type tag for the value held in the union below
+        union { // anonymous union: the members share storage, so only one value is held at a time
+            int64_t int_value; // NOTE(review): presumably the active member when tag == LLAMA_KV_OVERRIDE_INT
+            double float_value; // NOTE(review): presumably the active member when tag == LLAMA_KV_OVERRIDE_FLOAT
+            bool bool_value; // NOTE(review): presumably the active member when tag == LLAMA_KV_OVERRIDE_BOOL
+        };
+    };
+
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@@ -165,9 +181,13 @@ extern "C" {

        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
+
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible