diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index ed1014cae..644b1aee4 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name
 
 
@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ if __name__ == '__main__':
 
     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"
 
@@ -313,7 +321,10 @@ if __name__ == '__main__':
         lparams: dict[str, Any] = json.load(f)
 
     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,17 +382,26 @@ if __name__ == '__main__':
                     if self.lazy:
                         tensor = LazyTorchTensor.from_eager(tensor)
                     base_name = get_base_tensor_name(name)
-                    is_lora_a = ".lora_A.weight" in name
-                    is_lora_b = ".lora_B.weight" in name
+                    # note: lora_embedding_A/B are stored transposed compared to lora_A/B, so their roles are swapped here
+                    is_lora_a = ".lora_A.weight" in name or ".lora_embedding_B" in name
+                    is_lora_b = ".lora_B.weight" in name or ".lora_embedding_A" in name
                     if not is_lora_a and not is_lora_b:
                         if ".base_layer.weight" in name:
                             continue
+                        # mergekit-extract-lora adds these layernorm tensors to the adapter
+                        if ".layernorm" in name or ".norm" in name:
+                            yield (base_name, tensor)
+                            continue
                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                         if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                             logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
                             logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
                         sys.exit(1)
 
+                    # mergekit-extract-lora transposes this tensor, so transpose it back
+                    if ".lora_embedding" in name:
+                        tensor = tensor.T
+
                     if base_name in tensor_map:
                         if is_lora_a:
                             tensor_map[base_name].A = tensor
@@ -407,6 +427,13 @@ if __name__ == '__main__':
                 if name == "lm_head.weight" and len(dest) == 0:
                     raise ValueError("lm_head is present in adapter, but is ignored in base model")
                 for dest_name, dest_data in dest:
+                    # mergekit-extract-lora adds these layernorm tensors to the adapter
+                    if "_norm" in dest_name:
+                        assert dest_data.dim() == 1
+                        yield (dest_name, dest_data)
+                        continue
+
+                    # otherwise, we must get the lora_A and lora_B tensors
                     assert isinstance(dest_data, LoraTorchTensor)
                     lora_a, lora_b = dest_data.get_lora_A_B()
 
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 9fd7edea3..0ba2c3d1e 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -242,6 +242,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // norm only has 1 dim, so tensor b == nullptr
+            ab_map[name] = llama_lora_weight(cur);
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -251,6 +254,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     for (auto & it : ab_map) {
         const std::string & name = it.first;
         llama_lora_weight & w = it.second;
+        if (w.is_norm) {
+            continue;
+        }
 
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
@@ -279,6 +285,24 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }
 
+    // add norm vectors
+    for (auto & it : ab_map) {
+        const std::string & name = it.first;
+        llama_lora_weight & w = it.second;
+        if (w.is_norm) {
+            GGML_ASSERT(w.a != nullptr);
+            // use the device buft / ctx of the corresponding base model tensor
+            auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+            if (!model_tensor) {
+                throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            }
+            struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+            struct ggml_tensor * tensor_norm = ggml_dup_tensor(dev_ctx, w.a);
+            ggml_set_name(tensor_norm, w.a->name);
+            adapter.ab_map[it.first] = llama_lora_weight(tensor_norm);
+        }
+    }
+
     // allocate tensors / buffers and zero
     {
         adapter.ctxs.reserve(ctx_map.size());
@@ -311,7 +335,9 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
             auto orig = ab_map[it.first];
             auto dev  = it.second;
             set_tensor(orig.a, dev.a);
-            set_tensor(orig.b, dev.b);
+            if (!dev.is_norm) {
+                set_tensor(orig.b, dev.b);
+            }
         }
     }
 
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 5f1870cc8..aff0d8c0b 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -45,7 +45,11 @@ struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
 
+    // note: norm only has 1 dim, so tensor b == nullptr
+    bool is_norm = false; // is this a norm vector? (e.g. _norm.weight)
+
     llama_lora_weight() = default;
+    llama_lora_weight(struct ggml_tensor * a) : a(a), is_norm(true) {}
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 8ea6686c9..9c52afeba 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2545,6 +2545,28 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_tokens);
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+        //printf("tok_embd shape: %d x %d\n", tok_embd->ne[0], tok_embd->ne[1]);
+        //printf("inpL shape: %d x %d\n", inpL->ne[0], inpL->ne[1]);
+
+        // apply lora for embedding tokens if needed
+        for (auto & it : lctx.lora_adapters) {
+            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
+            if (lora == nullptr) {
+                continue;
+            }
+            const float alpha = it.first->alpha;
+            const float rank  = (float) lora->b->ne[0];
+            const float scale = alpha ? it.second * alpha / rank : it.second;
+            auto ss = ggml_get_rows(ctx, lora->b, lctx.inp_tokens);
+            //printf("a shape: %d x %d\n", lora->a->ne[0], lora->a->ne[1]);
+            //printf("b shape: %d x %d\n", lora->b->ne[0], lora->b->ne[1]);
+            //printf("ss shape: %d x %d\n", ss->ne[0], ss->ne[1]);
+            struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+                ctx, ss, ggml_transpose(ctx, lora->a)
+            ), scale);
+            //printf("inpL_delta shape: %d x %d\n", inpL_delta->ne[0], inpL_delta->ne[1]);
+            inpL = ggml_add(ctx, inpL, ggml_cont(ctx, ggml_transpose(ctx, inpL_delta)));
+        }
     } else {
         lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
         inpL = lctx.inp_embd;
@@ -3897,9 +3919,17 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
+            struct ggml_tensor * attn_norm = model.layers[il].attn_norm;
+            for (auto & it : lctx.lora_adapters) {
+                struct llama_lora_weight * lora = it.first->get_weight(model.layers[il].attn_norm);
+                if (lora && lora->is_norm) {
+                    attn_norm = ggml_add(ctx0, attn_norm, ggml_scale(ctx0, lora->a, 0.5));
+                }
+            }
+
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
+                    attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
@@ -3967,8 +3997,17 @@ struct llm_build_context {
 
             // feed-forward network
             if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                struct ggml_tensor * ffn_norm = model.layers[il].ffn_norm;
+                // for (auto & it : lctx.lora_adapters) {
+                //     struct llama_lora_weight * lora = it.first->get_weight(ffn_norm);
+                //     if (lora && lora->is_norm) {
+                //         ffn_norm = ggml_add(ctx0, ffn_norm, lora->a);
+                //     }
+                // }
+
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
+                        ffn_norm, NULL,
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);
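
For reference, the token-embedding path added in llm_build_inp_embd computes, for each input token, the base embedding row plus a scaled low-rank correction. Below is a rough NumPy sketch of that math, assuming the usual PEFT/mergekit shapes for embedding LoRA (lora_embedding_A is (r, n_vocab), lora_embedding_B is (n_embd, r)); the function and variable names are illustrative only and not part of the patch.

import numpy as np

def lora_embed(tokens, W_embd, emb_A, emb_B, alpha, adapter_scale=1.0):
    # tokens: (n_tokens,) int ids; W_embd: (n_vocab, n_embd) base embedding table
    # emb_A: (r, n_vocab), emb_B: (n_embd, r), as stored by PEFT / mergekit-extract-lora
    r = emb_A.shape[0]
    scale = adapter_scale * alpha / r if alpha else adapter_scale
    base = W_embd[tokens]               # (n_tokens, n_embd), same role as ggml_get_rows
    # the full delta would be (emb_B @ emb_A).T with shape (n_vocab, n_embd);
    # gathering only the rows for the current tokens keeps the computation small
    delta = emb_A.T[tokens] @ emb_B.T   # (n_tokens, r) @ (r, n_embd)
    return base + scale * delta

# toy shapes: n_vocab=10, n_embd=4, r=2
W = np.random.randn(10, 4); A = np.random.randn(2, 10); B = np.random.randn(4, 2)
print(lora_embed(np.array([1, 3, 3]), W, A, B, alpha=4.0).shape)  # (3, 4)

The ggml_transpose/ggml_cont calls in the patch appear to exist to express this same per-row computation in ggml's tensor layout.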