From e444b8e0c2662036111e123b234c59595775f216 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 7 Jan 2025 22:03:06 +0100
Subject: [PATCH] support mergekit-extract-lora

---
 convert_lora_to_gguf.py | 17 ++++++++-------
 src/llama-adapter.cpp   | 48 +++++++++++++++--------------------
 src/llama-adapter.h     |  9 +++++---
 src/llama.cpp           | 37 ++++++------------------------
 4 files changed, 39 insertions(+), 72 deletions(-)

diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 644b1aee4..83415ba8e 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -382,13 +382,13 @@ if __name__ == '__main__':
                     if self.lazy:
                         tensor = LazyTorchTensor.from_eager(tensor)
                     base_name = get_base_tensor_name(name)
-                    # note: lora_embedding is transposed by mergekit-extract-lora, so it's reversed here
-                    is_lora_a = ".lora_A.weight" in name or ".lora_embedding_B" in name
-                    is_lora_b = ".lora_B.weight" in name or ".lora_embedding_A" in name
+                    # note: mergekit-extract-lora also adds token embeddings to the adapter
+                    is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+                    is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                     if not is_lora_a and not is_lora_b:
                         if ".base_layer.weight" in name:
                             continue
-                        # mergekit-extract-lora add these layernorm to the adapter
+                        # mergekit-extract-lora adds these layernorm tensors to the adapter; we need to keep them
                         if ".layernorm" or ".norm" in name:
                             yield (base_name, tensor)
                             continue
@@ -398,10 +398,6 @@ if __name__ == '__main__':
                             logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                         sys.exit(1)

-                    # mergekit-extract-lora transposes this tensor, we need to transpose it back
-                    if ".lora_embedding" in name:
-                        tensor = tensor.T
-
                     if base_name in tensor_map:
                         if is_lora_a:
                             tensor_map[base_name].A = tensor
@@ -437,6 +433,11 @@ if __name__ == '__main__':
                     assert isinstance(dest_data, LoraTorchTensor)
                     lora_a, lora_b = dest_data.get_lora_A_B()

+                    # token_embd A and B are already transposed by mergekit-extract-lora
+                    # we transpose A back again because it is used by llm_build_inp_embd()
+                    if "token_embd.weight" in dest_name:
+                        lora_a = lora_a.T
+
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 0ba2c3d1e..f5a6c24ec 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -243,8 +243,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
                 ab_map[name].b = cur;
             }
         } else if (str_endswith(name, "_norm.weight")) {
-            // norm only has 1 dim, so tensor b == nullptr
-            ab_map[name] = llama_lora_weight(cur);
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -254,9 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
-        if (w.is_norm) {
-            continue;
-        }
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }
@@ -270,11 +269,18 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char

         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be transposed, see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
         }

         // save tensor to adapter
@@ -285,24 +291,6 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }

-    // add norm vectors
-    for (auto & it : ab_map) {
-        const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
-        if (w.is_norm) {
-            GGML_ASSERT(w.a != nullptr);
-            // device buft and device ctx
-            auto * model_tensor = llama_model_get_tensor(model, name.c_str());
-            if (!model_tensor) {
-                throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
-            }
-            struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
-            struct ggml_tensor * tensor_norm = ggml_dup_tensor(dev_ctx, w.a);
-            ggml_set_name(tensor_norm, w.a->name);
-            adapter.ab_map[it.first] = llama_lora_weight(tensor_norm);
-        }
-    }
-
     // allocate tensors / buffers and zero
     {
         adapter.ctxs.reserve(ctx_map.size());
@@ -335,9 +323,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             auto orig = ab_map[it.first];
             auto dev  = it.second;
             set_tensor(orig.a, dev.a);
-            if (!dev.is_norm) {
-                set_tensor(orig.b, dev.b);
-            }
+            set_tensor(orig.b, dev.b);
         }
     }

diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index aff0d8c0b..3448656b1 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -45,11 +45,14 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;

-    // note: norm only has 1 dim, so tensor b == nullptr
-    bool is_norm = false; // is this a norm vector? (e.g. _norm.weight)
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) {
+        const float rank = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }

     llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a) : a(a), is_norm(true) {}
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };

diff --git a/src/llama.cpp b/src/llama.cpp
index 9c52afeba..d79b2ab13 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2545,8 +2545,6 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_tokens);

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
-        //printf("tok_embd shape: %d x %d\n", tok_embd->ne[0], tok_embd->ne[1]);
-        //printf("inpL shape: %d x %d\n", inpL->ne[0], inpL->ne[1]);

         // apply lora for embedding tokens if needed
         for (auto & it : lctx.lora_adapters) {
@@ -2554,18 +2552,13 @@ static struct ggml_tensor * llm_build_inp_embd(
             if (lora == nullptr) {
                 continue;
             }
-            const float alpha = it.first->alpha;
-            const float rank = (float) lora->b->ne[0];
-            const float scale = alpha ? it.second * alpha / rank : it.second;
-            auto ss = ggml_get_rows(ctx, lora->b, lctx.inp_tokens);
-            //printf("a shape: %d x %d\n", lora->a->ne[0], lora->a->ne[1]);
-            //printf("b shape: %d x %d\n", lora->b->ne[0], lora->b->ne[1]);
-            //printf("ss shape: %d x %d\n", ss->ne[0], ss->ne[1]);
+            const float adapter_scale = it.second;
+            const float scale = lora->get_scale(it.first->alpha, adapter_scale);
             struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
-                ctx, ss, ggml_transpose(ctx, lora->a)
+                ctx, lora->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
             ), scale);
-            //printf("inpL_delta shape: %d x %d\n", inpL_delta->ne[0], inpL_delta->ne[1]);
-            inpL = ggml_add(ctx, inpL, ggml_cont(ctx, ggml_transpose(ctx, inpL_delta)));
+            inpL = ggml_add(ctx, inpL, inpL_delta);
         }
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
@@ -3919,17 +3912,9 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

-            struct ggml_tensor * attn_norm = model.layers[il].attn_norm;
-            for (auto & it : lctx.lora_adapters) {
-                struct llama_lora_weight * lora = it.first->get_weight(model.layers[il].attn_norm);
-                if (lora && lora->is_norm) {
-                    attn_norm = ggml_add(ctx0, attn_norm, ggml_scale(ctx0, lora->a, 0.5));
-                }
-            }
-
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    attn_norm, NULL,
+                    model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);

@@ -3998,16 +3983,8 @@ struct llm_build_context {

             // feed-forward network
             if (model.layers[il].ffn_gate_inp == nullptr) {
-                struct ggml_tensor * ffn_norm = model.layers[il].ffn_norm;
-                // for (auto & it : lctx.lora_adapters) {
-                //     struct llama_lora_weight * lora = it.first->get_weight(ffn_norm);
-                //     if (lora && lora->is_norm) {
-                //         ffn_norm = ggml_add(ctx0, ffn_norm, lora->a);
-                //     }
-                // }
-
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        ffn_norm, NULL,
+                        model.layers[il].ffn_norm, NULL,
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);

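A note on the token_embd convention used above (this remark and the sketch below are not part of the patch itself): the shape check added to llama_lora_adapter_init_impl() requires, for token_embd.weight, that b->ne[1] match model_tensor->ne[0] (n_embd) and a->ne[1] match model_tensor->ne[1] (n_vocab), which is what lets llm_build_inp_embd() call ggml_get_rows() on lora_a with the token ids and then ggml_mul_mat() with the non-transposed lora_b. The small NumPy sketch below mirrors that graph; it assumes the usual mapping of a ggml tensor with ne = [d0, d1] onto a row-major array of shape (d1, d0), and all sizes, names and the fixed scale value are illustrative only.

import numpy as np

# illustrative sizes; `scale` stands in for llama_lora_weight::get_scale(alpha, adapter_scale)
n_vocab, n_embd, rank, n_tokens = 32, 8, 4, 5
scale = 0.5

tok_embd = np.random.rand(n_vocab, n_embd).astype(np.float32)  # ggml ne = [n_embd, n_vocab]
lora_a   = np.random.rand(n_vocab, rank).astype(np.float32)    # ggml ne = [rank, n_vocab], so a->ne[1] == n_vocab
lora_b   = np.random.rand(n_embd, rank).astype(np.float32)     # ggml ne = [rank, n_embd], so b->ne[1] == n_embd
tokens   = np.random.randint(0, n_vocab, size=n_tokens)

# reference: apply the LoRA delta to the whole embedding table, then look up the token rows
expected = (tok_embd + scale * (lora_a @ lora_b.T))[tokens]

# what the graph does: ggml_get_rows(lora_a, tokens), ggml_mul_mat(lora_b, rows), ggml_scale, ggml_add
inpL       = tok_embd[tokens]                     # ggml_get_rows(tok_embd, inp_tokens)
inpL_delta = scale * (lora_a[tokens] @ lora_b.T)  # per-token delta, shape (n_tokens, n_embd)
assert np.allclose(inpL + inpL_delta, expected)

The shape check guards exactly this convention: if a->ne[1] does not equal n_vocab, or b->ne[1] does not equal n_embd, the adapter is rejected at load time instead of producing a malformed graph.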