Refactor lora adapter support (#8332)

* lora: load to device buft

* add patch tensor function

* correct tensor patch

* llama_lora_adapter_apply

* correct ggml_backend_tensor_copy

* add llm_build_mm

* fix auto merge

* update based on review comments

* add convert script

* no more transpose A

* add f16 convert

* add metadata check

* add sanity check

* fix ftype

* add requirements

* fix requirements

* fix outfile

* conversion: only allow selected models

* fix types

* cuda : do not use dmmv if the tensor does not have enough cols

* llama : lora fixes

* do not disable mmap with lora

Co-authored-by: slaren <slarengh@gmail.com>

* llm_build_lora_mm_id

* convert_lora : MoE LoRA conversion support

* convert_lora : prefer safetensors, similarly to convert_hf

* convert_hf : simplify modify_tensors for InternLM2

* convert_lora : lazy conversion

* llama : load and use alpha from LoRA adapters

* llama : use llm_build_lora_mm in most model graphs

* auto scale

* Revert "auto scale"

This reverts commit 42415a4874.

* remove redundant params

* Apply suggestions from code review

Co-authored-by: slaren <slarengh@gmail.com>

* change kv metadata

* move add_type to __init__

* convert_hf : move add_type to main()

* convert_lora : use the GGUFWriter from Model instead of overwriting it

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
This commit is contained in:
Xuan Son Nguyen 2024-07-15 20:50:47 +02:00 committed by GitHub
parent 4db8f60fe7
commit 97bdd26eee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 963 additions and 530 deletions

View file

@ -2264,13 +2264,6 @@ class InternLM2Model(Model):
special_vocab.add_to_gguf(self.gguf_writer)
def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def set_gguf_parameters(self):
self.gguf_writer.add_name("InternLM2")
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
@ -2290,26 +2283,22 @@ class InternLM2Model(Model):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"]
num_kv_heads = self.hparams["num_key_value_heads"]
hidden_size = self.hparams["hidden_size"]
n_embd = self.hparams["hidden_size"]
q_per_kv = num_heads // num_kv_heads
head_dim = hidden_size // num_heads
head_dim = n_embd // num_heads
num_groups = num_heads // q_per_kv
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
if re.match(qkv_pattern, name):
bid = re.findall(qkv_pattern, name)[0]
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
qkv = data_torch
# qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim))
q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd))
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
# The model weights of q and k require additional reshape.
# q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads)
# k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads)
# v = rearrange(v, " o g n i -> o (g n i)").T
v = v.reshape((v.shape[0], -1)).T
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
v = v.reshape((-1, v.shape[-1]))
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
@ -3585,6 +3574,7 @@ def main() -> None:
small_first_shard=args.no_tensor_first_split)
logger.info("Set model parameters")
model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL)
model_instance.set_gguf_parameters()
logger.info("Set model tokenizer")