Refactor lora adapter support (#8332)
* lora: load to device buft
* add patch tensor function
* correct tensor patch
* llama_lora_adapter_apply
* correct ggml_backend_tensor_copy
* add llm_build_mm
* fix auto merge
* update based on review comments
* add convert script
* no more transpose A
* add f16 convert
* add metadata check
* add sanity check
* fix ftype
* add requirements
* fix requirements
* fix outfile
* conversion: only allow selected models
* fix types
* cuda : do not use dmmv if the tensor does not have enough cols
* llama : lora fixes
* do not disable mmap with lora
Co-authored-by: slaren <slarengh@gmail.com>
* llm_build_lora_mm_id
* convert_lora : MoE LoRA conversion support
* convert_lora : prefer safetensors, similarly to convert_hf
* convert_hf : simplify modify_tensors for InternLM2
* convert_lora : lazy conversion
* llama : load and use alpha from LoRA adapters
* llama : use llm_build_lora_mm in most model graphs
* auto scale
* Revert "auto scale"
This reverts commit 42415a4874.
* remove redundant params
* Apply suggestions from code review
Co-authored-by: slaren <slarengh@gmail.com>
* change kv metadata
* move add_type to __init__
* convert_hf : move add_type to main()
* convert_lora : use the GGUFWriter from Model instead of overwriting it
---------
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
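Several of the entries above (add llm_build_mm, llm_build_lora_mm_id, "load and use alpha from LoRA adapters") revolve around the standard LoRA forward pass. A minimal numpy sketch of that computation follows; the names and shapes are illustrative only, not taken from llama.cpp:

    import numpy as np

    def lora_mm(W, A, B, x, alpha):
        # W: (n_out, n_in) base weight; A: (rank, n_in) and B: (n_out, rank)
        # are the low-rank adapter matrices; alpha comes from adapter metadata
        rank = A.shape[0]
        scale = alpha / rank
        # y = W.x + (alpha / rank) * B.(A.x)
        return W @ x + scale * (B @ (A @ x))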
parent 4db8f60fe7 · commit 97bdd26eee
12 changed files with 963 additions and 530 deletions
@@ -19,6 +19,7 @@ GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h

 class Keys:
     class General:
+        TYPE = "general.type"
         ARCHITECTURE = "general.architecture"
         QUANTIZATION_VERSION = "general.quantization_version"
         ALIGNMENT = "general.alignment"
@@ -120,11 +121,20 @@ class Keys:
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
         EOT_ID = "tokenizer.ggml.eot_token_id"

+    class Adapter:
+        TYPE = "adapter.type"
+        LORA_ALPHA = "adapter.lora.alpha"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #


+class GGUFType:
+    MODEL = "model"
+    ADAPTER = "adapter"
+
+
 class MODEL_ARCH(IntEnum):
     LLAMA = auto()
     FALCON = auto()
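The two new key groups split cleanly: "general.type" says what kind of GGUF file this is, while the "adapter.*" keys describe a LoRA adapter. A hedged sketch of the metadata an adapter file carries under this scheme (the alpha value is an example, not a default):

    from gguf import Keys, GGUFType

    adapter_kv = {
        Keys.General.TYPE: GGUFType.ADAPTER,   # "general.type" = "adapter"
        Keys.Adapter.TYPE: "lora",             # "adapter.type"
        Keys.Adapter.LORA_ALPHA: 16.0,         # "adapter.lora.alpha"
    }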
@@ -424,6 +424,9 @@ class GGUFWriter:
             fout.close()
         self.fout = None

+    def add_type(self, type_name: str) -> None:
+        self.add_string(Keys.General.TYPE, type_name)
+
     def add_architecture(self) -> None:
         self.add_string(Keys.General.ARCHITECTURE, self.arch)

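With add_type in place, a converter can tag its output before writing tensors. A sketch of assumed usage (the output path, architecture, and alpha value here are illustrative, not from this commit):

    import gguf
    from gguf import Keys, GGUFType

    writer = gguf.GGUFWriter("adapter.gguf", arch="llama")
    writer.add_type(GGUFType.ADAPTER)              # writes "general.type" = "adapter"
    writer.add_string(Keys.Adapter.TYPE, "lora")
    writer.add_float32(Keys.Adapter.LORA_ALPHA, 16.0)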
@@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np
     for dim in oshape:
         osize *= dim
     out = np.empty(shape=osize, dtype=otype)
     # compute over groups of 16 rows (arbitrary, but seems good for performance)
-    n_groups = rows.shape[0] // 16
+    n_groups = (rows.shape[0] // 16) or 1
     np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
     return out.reshape(oshape)
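The one-line change above guards a degenerate case: for tensors with fewer than 16 rows the old expression yielded n_groups == 0, and np.array_split refuses zero sections. A minimal repro, not taken from the diff:

    import numpy as np

    rows = np.zeros((5, 64), dtype=np.float32)  # fewer than 16 rows
    n_groups = (rows.shape[0] // 16) or 1       # 5 // 16 == 0, so fall back to 1
    groups = np.array_split(rows, n_groups)     # a single group holding all 5 rows
    assert len(groups) == 1 and groups[0].shape == (5, 64)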