diff --git a/common/common.cpp b/common/common.cpp index 4cc71179c..dac152c4f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,7 +684,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 0.0f); + params.lora_adapter.emplace_back(argv[i], 1.0f); return true; } if (arg == "--lora-scaled") { @@ -2089,9 +2089,6 @@ std::tuple llama_init_from_gpt_par llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - if (lora_scale == 0.0f) { - lora_scale = llama_lora_adapter_get_default_scale(adapter); - } llama_lora_adapter_set(lctx, adapter, lora_scale); } diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index be0b6f272..71d3e57f5 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -366,11 +366,9 @@ if __name__ == '__main__': lparams: dict[str, Any] = json.load(f) alpha = lparams["lora_alpha"] - rank = lparams["r"] model_instance.gguf_writer.add_string("training.type", "finetune_lora") model_instance.gguf_writer.add_float32("training.lora.alpha", float(alpha)) - model_instance.gguf_writer.add_float32("training.lora.scale", float(alpha) / float(rank)) model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") diff --git a/include/llama.h b/include/llama.h index 01ea88466..c57d21f0c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -513,33 +513,12 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a loaded control vector to a llama_context, or if data is NULL, clear - // the currently loaded vector. - // n_embd should be the size of a single layer's control, and data should point - // to an n_embd x n_layers buffer starting from layer 1. - // il_start and il_end are the layer range the vector should apply to (both inclusive) - // See llama_control_vector_load in common to load a control vector. - LLAMA_API int32_t llama_control_vector_apply( - struct llama_context * lctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end); - - // - // LoRA - // - // Load a LoRA adapter from file // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_model * model, const char * path_lora); - // Get default scale of an adapter - LLAMA_API float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter); - // Add a loaded LoRA adapter to given context // This will not modify model's weight LLAMA_API int32_t llama_lora_adapter_set( @@ -557,6 +536,20 @@ extern "C" { // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + // // KV cache // diff --git a/src/llama.cpp b/src/llama.cpp index 4c77b1014..d5a7bb62b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -380,7 +380,6 @@ enum llm_kv { LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_LORA_ALPHA, - LLM_KV_TRAINING_LORA_SCALE, }; static const std::map LLM_KV_NAMES = { @@ -477,7 +476,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TRAINING_TYPE, "training.type" }, { LLM_KV_TRAINING_LORA_ALPHA, "training.lora.alpha" }, - { LLM_KV_TRAINING_LORA_SCALE, "training.lora.scale" }, }; struct LLM_KV { @@ -2853,7 +2851,6 @@ struct llama_lora_adapter { std::vector bufs; float alpha; - float scale; // default scale llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { base_model->lora_adapters.insert(this); @@ -18581,7 +18578,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { - LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -18618,7 +18615,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_ALPHA)); - adapter.scale = get_kv_f32(llm_kv(LLM_KV_TRAINING_LORA_SCALE)); } int n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -18753,10 +18749,6 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c ggml_free(ctx); } -float llama_lora_adapter_get_default_scale(struct llama_lora_adapter * adapter) { - return adapter->scale; -} - int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter,