lora : improve compat with mergekit-extract-lora (#11131)

* (wip) support mergekit-extracted lora
* support mergekit-extract-lora
* use lora->get_scale
* correct comment
* correct norm name & condition
* add some hints
commit 4d2b3d8804 (parent c07d437bbd)

4 changed files with 74 additions and 12 deletions
@@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
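The dispatch above is driven purely by tensor-name suffixes (the earlier branches of the same function handle the ".lora_a"/".lora_b" pairs). For reference, a minimal standalone sketch of such a suffix check; the helper below is a stand-in and may differ from the actual str_endswith in llama.cpp:

    #include <cassert>
    #include <string>

    // hypothetical stand-in for the str_endswith helper referenced above
    static bool str_endswith(const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() &&
               str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    int main() {
        // norm vectors produced by mergekit-extract-lora are currently skipped
        assert(str_endswith("blk.0.attn_norm.weight", "_norm.weight"));
        // regular LoRA pairs keep the .lora_a / .lora_b suffixes
        assert(!str_endswith("blk.0.attn_q.weight.lora_a", "_norm.weight"));
        return 0;
    }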
@@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");

         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -259,16 +264,23 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         // device buft and device ctx
         auto * model_tensor = llama_model_get_tensor(model, name.c_str());
         if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }

         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
-        }
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
+        }

         // save tensor to adapter
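To make the two accepted shape conventions concrete: for a regular weight with ne = {n_in, n_out}, lora_a must be {n_in, rank} and lora_b must be {rank, n_out}; for token_embd (model tensor {n_embd, n_vocab}) the roles of A and B are flipped, so A is {rank, n_vocab} and B is {rank, n_embd}, with B non-transposed. A small self-contained sketch that mirrors the checks above, using made-up sizes rather than anything from a real model:

    #include <cassert>
    #include <cstdint>

    // minimal stand-in for the part of ggml_tensor used by the shape check;
    // ne[0] is the fastest-varying dimension, as in ggml
    struct toy_tensor {
        int64_t ne[2];
    };

    int main() {
        const int64_t n_embd = 8, n_vocab = 32, n_ff = 16, rank = 4;

        // regular weight, e.g. a feed-forward projection:
        // W is {n_embd, n_ff}, lora_a is {n_embd, rank}, lora_b is {rank, n_ff}
        toy_tensor w = {{n_embd, n_ff}};
        toy_tensor a = {{n_embd, rank}};
        toy_tensor b = {{rank,   n_ff}};
        assert(w.ne[0] == a.ne[0] && w.ne[1] == b.ne[1]); // shape check
        assert(a.ne[1] == b.ne[0]);                       // lora_a is transposed

        // token_embd: A and B are flipped, B is non-transposed
        toy_tensor tok_embd = {{n_embd, n_vocab}};
        toy_tensor tok_a    = {{rank, n_vocab}};
        toy_tensor tok_b    = {{rank, n_embd}};
        assert(tok_embd.ne[0] == tok_b.ne[1] && tok_embd.ne[1] == tok_a.ne[1]);

        return 0;
    }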
@@ -45,6 +45,13 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;

+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) {
+        const float rank  = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
     llama_lora_weight() = default;
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
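As a quick sanity check of the arithmetic in get_scale: with alpha = 32, rank 16 (b->ne[0] == 16) and an adapter scale of 1.0, the effective scale is 1.0 * 32 / 16 = 2.0; with alpha == 0 the adapter scale is returned unchanged. A minimal sketch of the same formula as a free function (hypothetical name, rank passed in explicitly instead of read from the tensor):

    #include <cassert>

    // same arithmetic as llama_lora_weight::get_scale above
    static float lora_scale(float alpha, float adapter_scale, float rank) {
        return alpha ? adapter_scale * alpha / rank : adapter_scale;
    }

    int main() {
        assert(lora_scale(32.0f, 1.0f, 16.0f) == 2.0f); // alpha / rank scaling
        assert(lora_scale( 0.0f, 0.5f, 16.0f) == 0.5f); // alpha == 0: raw adapter scale
        return 0;
    }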
@@ -2545,6 +2545,21 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_tokens);

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+
+        // apply lora for embedding tokens if needed
+        for (auto & it : lctx.lora_adapters) {
+            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
+            if (lora == nullptr) {
+                continue;
+            }
+            const float adapter_scale = it.second;
+            const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+            struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+                ctx, lora->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
+            ), scale);
+            inpL = ggml_add(ctx, inpL, inpL_delta);
+        }
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
         inpL = lctx.inp_embd;
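In the graph built above, the adapter contribution to the input embeddings is, per token, delta[e] = scale * sum_r B(r, e) * A(r, token): ggml_get_rows picks the rank-sized row of lora_a for each token id, and the mul_mat with the non-transposed lora_b maps it to n_embd values. A tiny plain-array sketch of the same arithmetic, with made-up numbers and no ggml involved:

    #include <cassert>
    #include <cstdio>

    int main() {
        // illustrative sizes; a real model is much larger
        constexpr int n_vocab = 4, n_embd = 3, rank = 2;

        // lora_a: one rank-sized row per vocab entry (ggml ne = {rank, n_vocab})
        const float A[n_vocab][rank] = {
            {1.0f, 0.0f}, {0.0f, 1.0f}, {1.0f, 1.0f}, {0.5f, 0.5f},
        };
        // lora_b: non-transposed (ggml ne = {rank, n_embd})
        const float B[n_embd][rank] = {
            {1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f},
        };

        const float scale = 2.0f; // e.g. adapter_scale * alpha / rank
        const int   token = 2;    // one token id

        // delta[e] = scale * sum_r B[e][r] * A[token][r]
        // mirrors ggml_mul_mat(ctx, lora->b, ggml_get_rows(ctx, lora->a, tokens))
        float delta[n_embd];
        for (int e = 0; e < n_embd; ++e) {
            float acc = 0.0f;
            for (int r = 0; r < rank; ++r) {
                acc += B[e][r] * A[token][r];
            }
            delta[e] = scale * acc;
        }

        // token 2 has A row {1, 1}, so delta = 2 * {3, 7, 11} = {6, 14, 22}
        assert(delta[0] == 6.0f && delta[1] == 14.0f && delta[2] == 22.0f);
        printf("delta = {%.1f, %.1f, %.1f}\n", delta[0], delta[1], delta[2]);
        return 0;
    }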
@@ -2617,9 +2632,8 @@ static struct ggml_tensor * llm_build_lora_mm(
         if (lora == nullptr) {
             continue;
         }
-        const float alpha = it.first->alpha;
-        const float rank  = (float) lora->b->ne[0];
-        const float scale = alpha ? it.second * alpha / rank : it.second;
+        const float adapter_scale = it.second;
+        const float scale = lora->get_scale(it.first->alpha, adapter_scale);
         struct ggml_tensor * ab_cur = ggml_mul_mat(
             ctx0, lora->b,
             ggml_mul_mat(ctx0, lora->a, cur)
@@ -3967,6 +3981,7 @@ struct llm_build_context {

         // feed-forward network
         if (model.layers[il].ffn_gate_inp == nullptr) {
+
             cur = llm_build_norm(ctx0, ffn_inp, hparams,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, cb, il);