(wip) support mergekit-extracted lora
parent dc7cef9f37
commit 93fbfd022c

4 changed files with 102 additions and 6 deletions
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name

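As a side note, a minimal standalone sketch of the name mapping introduced above (the helper and the example tensor name are illustrative, not taken from the converter):

    # sketch: how a mergekit-extract-lora embedding tensor name maps to its base tensor
    def to_base_name(lora_tensor_name: str) -> str:
        base = lora_tensor_name.replace("base_model.model.", "")
        base = base.replace(".lora_A.weight", ".weight")
        base = base.replace(".lora_B.weight", ".weight")
        base = base.replace(".lora_embedding_A", ".weight")
        base = base.replace(".lora_embedding_B", ".weight")
        return base

    print(to_base_name("base_model.model.model.embed_tokens.lora_embedding_A"))
    # -> model.embed_tokens.weight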
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ if __name__ == '__main__':

     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"

@@ -313,7 +321,10 @@ if __name__ == '__main__':
         lparams: dict[str, Any] = json.load(f)

     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,17 +382,26 @@ if __name__ == '__main__':
                 if self.lazy:
                     tensor = LazyTorchTensor.from_eager(tensor)
                 base_name = get_base_tensor_name(name)
-                is_lora_a = ".lora_A.weight" in name
-                is_lora_b = ".lora_B.weight" in name
+                # note: lora_embedding is transposed by mergekit-extract-lora, so it's reversed here
+                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_B" in name
+                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_A" in name
                 if not is_lora_a and not is_lora_b:
                     if ".base_layer.weight" in name:
                         continue
+                    # mergekit-extract-lora adds these layernorms to the adapter
+                    if ".layernorm" in name or ".norm" in name:
+                        yield (base_name, tensor)
+                        continue
                     logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                     if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                         logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
                         logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                     sys.exit(1)

+                # mergekit-extract-lora transposes this tensor, we need to transpose it back
+                if ".lora_embedding" in name:
+                    tensor = tensor.T
+
                 if base_name in tensor_map:
                     if is_lora_a:
                         tensor_map[base_name].A = tensor
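For intuition, a small numpy check of the identity behind this swap (a sketch assuming the stored .lora_embedding_A / .lora_embedding_B factors are the transposes of the conventional lora_B / lora_A, as the comment above states; names and shapes are illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    r, n_out, n_in = 4, 8, 16                   # illustrative rank and dimensions
    lora_a = rng.standard_normal((r, n_in))     # conventional lora_A
    lora_b = rng.standard_normal((n_out, r))    # conventional lora_B
    delta = lora_b @ lora_a                     # delta_W = B @ A

    # what the adapter is assumed to store for embeddings (transposed factors)
    stored_emb_a = lora_b.T                     # ".lora_embedding_A"
    stored_emb_b = lora_a.T                     # ".lora_embedding_B"

    # swapping the roles and transposing each factor (as the converter does) recovers delta_W
    recovered = stored_emb_a.T @ stored_emb_b.T
    assert np.allclose(recovered, delta)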
@@ -407,6 +427,13 @@ if __name__ == '__main__':
             if name == "lm_head.weight" and len(dest) == 0:
                 raise ValueError("lm_head is present in adapter, but is ignored in base model")
             for dest_name, dest_data in dest:
+                # mergekit-extract-lora adds these layernorms to the adapter
+                if "_norm" in dest_name:
+                    assert dest_data.dim() == 1
+                    yield (dest_name, dest_data)
+                    continue
+
+                # otherwise, we must get the lora_A and lora_B tensors
                 assert isinstance(dest_data, LoraTorchTensor)
                 lora_a, lora_b = dest_data.get_lora_A_B()

@@ -242,6 +242,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // norm only has 1 dim, so tensor b == nullptr
+            ab_map[name] = llama_lora_weight(cur);
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -251,6 +254,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
+        if (w.is_norm) {
+            continue;
+        }

         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -279,6 +285,24 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }

+    // add norm vectors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_lora_weight & w = it.second;
+        if (w.is_norm) {
+            GGML_ASSERT(w.a != nullptr);
+            // device buft and device ctx
+            auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+            if (!model_tensor) {
+                throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            }
+            struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+            struct ggml_tensor * tensor_norm = ggml_dup_tensor(dev_ctx, w.a);
+            ggml_set_name(tensor_norm, w.a->name);
+            adapter.ab_map[it.first] = llama_lora_weight(tensor_norm);
+        }
+    }
+
     // allocate tensors / buffers and zero
     {
         adapter.ctxs.reserve(ctx_map.size());
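In plain Python, the loading rule these adapter hunks introduce looks roughly like the sketch below (the .lora_a / .lora_b suffixes are assumed from the loader's usual naming and are not shown in this excerpt): tensors ending in _norm.weight become single-vector entries with no b component, while everything else must still arrive as a complete a/b pair.

    def classify_adapter_tensor(name: str) -> str:
        # sketch of the suffix handling in llama_lora_adapter_init_impl
        if name.endswith(".lora_a"):
            return "a"
        if name.endswith(".lora_b"):
            return "b"
        if name.endswith("_norm.weight"):
            return "norm"   # 1-D vector, b stays nullptr
        raise ValueError(f"LoRA tensor '{name}' has unexpected suffix")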
@@ -311,7 +335,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             auto orig = ab_map[it.first];
             auto dev = it.second;
             set_tensor(orig.a, dev.a);
-            set_tensor(orig.b, dev.b);
+            if (!dev.is_norm) {
+                set_tensor(orig.b, dev.b);
+            }
         }
     }

@@ -45,7 +45,11 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;

+    // note: norm only has 1 dim, so tensor b == nullptr
+    bool is_norm = false; // is this a norm vector? (e.g. _norm.weight)
+
     llama_lora_weight() = default;
+    llama_lora_weight(struct ggml_tensor * a) : a(a), is_norm(true) {}
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };

@@ -2545,6 +2545,28 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_tokens);

         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+        //printf("tok_embd shape: %d x %d\n", tok_embd->ne[0], tok_embd->ne[1]);
+        //printf("inpL shape: %d x %d\n", inpL->ne[0], inpL->ne[1]);
+
+        // apply lora for embedding tokens if needed
+        for (auto & it : lctx.lora_adapters) {
+            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
+            if (lora == nullptr) {
+                continue;
+            }
+            const float alpha = it.first->alpha;
+            const float rank = (float) lora->b->ne[0];
+            const float scale = alpha ? it.second * alpha / rank : it.second;
+            auto ss = ggml_get_rows(ctx, lora->b, lctx.inp_tokens);
+            //printf("a shape: %d x %d\n", lora->a->ne[0], lora->a->ne[1]);
+            //printf("b shape: %d x %d\n", lora->b->ne[0], lora->b->ne[1]);
+            //printf("ss shape: %d x %d\n", ss->ne[0], ss->ne[1]);
+            struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+                ctx, ss, ggml_transpose(ctx, lora->a)
+            ), scale);
+            //printf("inpL_delta shape: %d x %d\n", inpL_delta->ne[0], inpL_delta->ne[1]);
+            inpL = ggml_add(ctx, inpL, ggml_cont(ctx, ggml_transpose(ctx, inpL_delta)));
+        }
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
         inpL = lctx.inp_embd;
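Conceptually, the block above adds a per-token low-rank delta on top of the embedding lookup, scaled by adapter_scale * alpha / rank when alpha is set. A rough numpy sketch, assuming A of shape (r, n_embd) and B of shape (n_vocab, r) — not the actual ggml layout or mul_mat semantics:

    import numpy as np

    def embed_with_lora(tok_embd, token_ids, A, B, alpha, adapter_scale):
        # tok_embd: (n_vocab, n_embd), A: (r, n_embd), B: (n_vocab, r)  -- assumed layout
        x = tok_embd[token_ids]                      # plain embedding lookup
        r = A.shape[0]
        scale = adapter_scale * alpha / r if alpha else adapter_scale
        delta = (B[token_ids] @ A) * scale           # per-token low-rank correction
        return x + delta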
@@ -3897,9 +3919,17 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

+            struct ggml_tensor * attn_norm = model.layers[il].attn_norm;
+            for (auto & it : lctx.lora_adapters) {
+                struct llama_lora_weight * lora = it.first->get_weight(model.layers[il].attn_norm);
+                if (lora && lora->is_norm) {
+                    attn_norm = ggml_add(ctx0, attn_norm, ggml_scale(ctx0, lora->a, 0.5));
+                }
+            }
+
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
+                    attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
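In toy numpy form, the patch above amounts to the sketch below (an assumption-laden reading: the adapter's norm entry is taken as a 1-D additive delta on the RMS-norm weight, and the hard-coded 0.5 as a provisional scale in this WIP commit):

    import numpy as np

    def patched_norm_weight(base_norm_w, lora_norm_vec, scale=0.5):
        # base_norm_w, lora_norm_vec: 1-D arrays of length n_embd
        return base_norm_w + scale * lora_norm_vec

    def rms_norm(x, w, eps=1e-5):
        # standard RMS norm with the (patched) weight vector w
        return x / np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) * w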
@@ -3967,8 +3997,17 @@ struct llm_build_context {

             // feed-forward network
             if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                struct ggml_tensor * ffn_norm = model.layers[il].ffn_norm;
+                // for (auto & it : lctx.lora_adapters) {
+                //     struct llama_lora_weight * lora = it.first->get_weight(ffn_norm);
+                //     if (lora && lora->is_norm) {
+                //         ffn_norm = ggml_add(ctx0, ffn_norm, lora->a);
+                //     }
+                // }
+
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
+                        ffn_norm, NULL,
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);