(wip) support mergekit-extracted lora

Xuan Son Nguyen 2025-01-07 00:35:16 +01:00
parent dc7cef9f37
commit 93fbfd022c
4 changed files with 102 additions and 6 deletions


@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name
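
For context, a standalone sketch of the resulting name mapping (illustrative only, not part of the converter; the example tensor key is hypothetical and depends on how the adapter was saved):

```python
# Illustrative sketch of the mapping after this change: adapter tensors produced by
# mergekit-extract-lora for the token embeddings use lora_embedding_A/B suffixes and
# map back to the base embedding weight name.
def base_tensor_name(lora_tensor_name: str) -> str:
    name = lora_tensor_name.replace("base_model.model.", "")
    name = name.replace(".lora_A.weight", ".weight")
    name = name.replace(".lora_B.weight", ".weight")
    name = name.replace(".lora_embedding_A", ".weight")
    name = name.replace(".lora_embedding_B", ".weight")
    return name

# hypothetical key; real keys depend on the tool that saved the adapter
print(base_tensor_name("base_model.model.model.embed_tokens.lora_embedding_A"))
# -> model.embed_tokens.weight
```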
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ if __name__ == '__main__':
     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"
@@ -313,7 +321,10 @@ if __name__ == '__main__':
         lparams: dict[str, Any] = json.load(f)
     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,17 +382,26 @@ if __name__ == '__main__':
                 if self.lazy:
                     tensor = LazyTorchTensor.from_eager(tensor)
                 base_name = get_base_tensor_name(name)
-                is_lora_a = ".lora_A.weight" in name
-                is_lora_b = ".lora_B.weight" in name
+                # note: lora_embedding is transposed by mergekit-extract-lora, so it's reversed here
+                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_B" in name
+                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_A" in name
                 if not is_lora_a and not is_lora_b:
                     if ".base_layer.weight" in name:
                         continue
+                    # mergekit-extract-lora adds these layernorm tensors to the adapter
+                    if ".layernorm" in name or ".norm" in name:
+                        yield (base_name, tensor)
+                        continue
                     logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                     if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                         logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
                         logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                     sys.exit(1)
+                # mergekit-extract-lora transposes this tensor, we need to transpose it back
+                if ".lora_embedding" in name:
+                    tensor = tensor.T
                 if base_name in tensor_map:
                     if is_lora_a:
                         tensor_map[base_name].A = tensor
@@ -407,6 +427,13 @@ if __name__ == '__main__':
                 if name == "lm_head.weight" and len(dest) == 0:
                     raise ValueError("lm_head is present in adapter, but is ignored in base model")
                 for dest_name, dest_data in dest:
+                    # mergekit-extract-lora adds these layernorm tensors to the adapter
+                    if "_norm" in dest_name:
+                        assert dest_data.dim() == 1
+                        yield (dest_name, dest_data)
+                        continue
+                    # otherwise, we must get the lora_A and lora_B tensors
                     assert isinstance(dest_data, LoraTorchTensor)
                     lora_a, lora_b = dest_data.get_lora_A_B()
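
The lora_embedding handling in the hunks above (swap A/B, then transpose each tensor) relies on a simple identity: if a tool factorizes the *transposed* delta instead of the delta itself, its two factors are exactly the standard lora_A/lora_B with roles swapped and each transposed, since (B·A)ᵀ = Aᵀ·Bᵀ. A minimal NumPy sketch of that identity (illustrative; it does not claim to reproduce mergekit's exact on-disk shapes):

```python
# Why "swap A/B and transpose" undoes a transposed factorization.
import numpy as np

rng = np.random.default_rng(0)
r, n_in, n_out = 4, 8, 6
A = rng.normal(size=(r, n_in))      # standard lora_A: (rank, in_features)
B = rng.normal(size=(n_out, r))     # standard lora_B: (out_features, rank)
delta = B @ A                       # (out_features, in_features)

# factors of the transposed delta, as a tool storing the transposed layout would see them
stored_B = A.T                      # plays the role of "B" for delta.T
stored_A = B.T                      # plays the role of "A" for delta.T
assert np.allclose(stored_B @ stored_A, delta.T)

# recover the standard factors: swap the roles and transpose each tensor
recovered_A, recovered_B = stored_B.T, stored_A.T
assert np.allclose(recovered_B @ recovered_A, delta)
```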


@@ -242,6 +242,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // norm only has 1 dim, so tensor b == nullptr
+            ab_map[name] = llama_lora_weight(cur);
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -251,6 +254,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
+        if (w.is_norm) {
+            continue;
+        }
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -279,6 +285,24 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }
+    // add norm vectors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_lora_weight & w = it.second;
+        if (w.is_norm) {
+            GGML_ASSERT(w.a != nullptr);
+            // pick the device buft and device ctx from the matching tensor in the base model
+            auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+            if (!model_tensor) {
+                throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            }
+            struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+            struct ggml_tensor * tensor_norm = ggml_dup_tensor(dev_ctx, w.a);
+            ggml_set_name(tensor_norm, w.a->name);
+            adapter.ab_map[it.first] = llama_lora_weight(tensor_norm);
+        }
+    }
     // allocate tensors / buffers and zero
     {
         adapter.ctxs.reserve(ctx_map.size());
@@ -311,7 +335,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             auto orig = ab_map[it.first];
             auto dev = it.second;
             set_tensor(orig.a, dev.a);
-            set_tensor(orig.b, dev.b);
+            if (!dev.is_norm) {
+                set_tensor(orig.b, dev.b);
+            }
         }
     }
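
A small Python stand-in for the classification the loader hunks above perform (illustrative, not the C++ code; names follow the GGUF LoRA convention of a `.lora_a`/`.lora_b` suffix on the base tensor name, while norm vectors carry only a single tensor, mirroring `b == nullptr`):

```python
# Illustrative sketch: classify adapter tensors the way the loader does after this change.
def classify(name: str) -> str:
    if name.endswith(".lora_a") or name.endswith(".lora_b"):
        return "half of a lora A/B pair (both halves required)"
    if name.endswith("_norm.weight"):
        return "1-D norm vector (no lora_b counterpart)"
    raise ValueError(f"LoRA tensor '{name}' has unexpected suffix")

print(classify("blk.0.attn_q.weight.lora_a"))  # half of a lora A/B pair
print(classify("blk.0.attn_norm.weight"))      # 1-D norm vector
```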


@@ -45,7 +45,11 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
+    // note: norm only has 1 dim, so tensor b == nullptr
+    bool is_norm = false; // is this a norm vector? (e.g. _norm.weight)
     llama_lora_weight() = default;
+    llama_lora_weight(struct ggml_tensor * a) : a(a), is_norm(true) {}
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };


@@ -2545,6 +2545,28 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_tokens);
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+        //printf("tok_embd shape: %d x %d\n", tok_embd->ne[0], tok_embd->ne[1]);
+        //printf("inpL shape: %d x %d\n", inpL->ne[0], inpL->ne[1]);
+        // apply lora for embedding tokens if needed
+        for (auto & it : lctx.lora_adapters) {
+            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
+            if (lora == nullptr) {
+                continue;
+            }
+            const float alpha = it.first->alpha;
+            const float rank  = (float) lora->b->ne[0];
+            const float scale = alpha ? it.second * alpha / rank : it.second;
+            auto ss = ggml_get_rows(ctx, lora->b, lctx.inp_tokens);
+            //printf("a shape: %d x %d\n", lora->a->ne[0], lora->a->ne[1]);
+            //printf("b shape: %d x %d\n", lora->b->ne[0], lora->b->ne[1]);
+            //printf("ss shape: %d x %d\n", ss->ne[0], ss->ne[1]);
+            struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+                ctx, ss, ggml_transpose(ctx, lora->a)
+            ), scale);
+            //printf("inpL_delta shape: %d x %d\n", inpL_delta->ne[0], inpL_delta->ne[1]);
+            inpL = ggml_add(ctx, inpL, ggml_cont(ctx, ggml_transpose(ctx, inpL_delta)));
+        }
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
         inpL = lctx.inp_embd;
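
In conventional row-major terms, the loop above computes `E[ids] + scale * (B[ids] @ A)` without ever materializing the full LoRA delta over the vocabulary: rows of `lora_b` are gathered for the current token ids (via `ggml_get_rows`), projected through `lora_a`, scaled by `alpha / rank`, and added to the embedding lookup. A NumPy sketch of that math (shapes are the conventional ones, not the exact ggml layout):

```python
# Sketch of the per-token embedding LoRA delta: gather, project, scale, add.
import numpy as np

rng = np.random.default_rng(0)
n_vocab, n_embd, r = 32, 8, 4
alpha, adapter_scale = 16.0, 1.0

E = rng.normal(size=(n_vocab, n_embd))   # base token-embedding table
A = rng.normal(size=(r, n_embd))         # lora_a
B = rng.normal(size=(n_vocab, r))        # lora_b (one row per token)
scale = adapter_scale * alpha / r if alpha else adapter_scale

tokens = np.array([3, 7, 7, 12])
base_rows  = E[tokens]                   # "get_rows" on the base table
delta_rows = scale * (B[tokens] @ A)     # gather rows of B, then project with A
inp = base_rows + delta_rows

# same result as applying the fully materialized LoRA delta to the table
assert np.allclose(inp, (E + scale * (B @ A))[tokens])
```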
@@ -3897,9 +3919,17 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
+            struct ggml_tensor * attn_norm = model.layers[il].attn_norm;
+            for (auto & it : lctx.lora_adapters) {
+                struct llama_lora_weight * lora = it.first->get_weight(model.layers[il].attn_norm);
+                if (lora && lora->is_norm) {
+                    attn_norm = ggml_add(ctx0, attn_norm, ggml_scale(ctx0, lora->a, 0.5));
+                }
+            }
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
+                    attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
@@ -3967,8 +3997,17 @@ struct llm_build_context {
             // feed-forward network
             if (model.layers[il].ffn_gate_inp == nullptr) {
+                struct ggml_tensor * ffn_norm = model.layers[il].ffn_norm;
+                // for (auto & it : lctx.lora_adapters) {
+                //     struct llama_lora_weight * lora = it.first->get_weight(ffn_norm);
+                //     if (lora && lora->is_norm) {
+                //         ffn_norm = ggml_add(ctx0, ffn_norm, lora->a);
+                //     }
+                // }
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
+                        ffn_norm, NULL,
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);