lora : add support for non-llama models

ggml-ci
slaren 2023-09-26 01:35:38 +02:00
parent 1726f9626f
commit 8c1828aa6c
3 changed files with 96 additions and 76 deletions

convert-lora-to-ggml.py

@@ -11,43 +11,14 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
+from pathlib import Path
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf

 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}

-HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attn_q",
-    "self_attn.k_proj": "attn_k",
-    "self_attn.v_proj": "attn_v",
-    "self_attn.o_proj": "attn_output",
-    "mlp.gate_proj": "ffn_gate",
-    "mlp.down_proj": "ffn_down",
-    "mlp.up_proj": "ffn_up",
-    "input_layernorm": "attn_norm",
-    "post_attention_layernorm": "ffn_norm",
-}
-
-def translate_tensor_name(t: str) -> str:
-    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
-    if match:
-        nn = match.group(1)
-        sub_layer = match.group(2)
-        lora_type = match.group(3)
-
-        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
-        if sub_layer_renamed is None:
-            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
-            sys.exit(1)
-
-        output_string = (
-            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
-        )
-        return output_string
-    else:
-        print(f"Error: unrecognized tensor {t}")
-        sys.exit(1)

 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(b"ggla"[::-1]) # magic (ggml lora)
     fout.write(struct.pack("i", 1)) # file version
@@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(struct.pack("i", int(params["lora_alpha"])))

-def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
-) -> None:
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
     sname = name.encode("utf-8")
     fout.write(
         struct.pack(
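Note: the adapter file header written by write_file_header (and read back in llama_apply_lora_from_file_internal below) consists of four 32-bit fields: the 'ggla' magic, a format version, the LoRA rank r, and lora_alpha. A minimal reader sketch, assuming the fields are emitted as native little-endian int32 via struct.pack("i", ...) as in the surrounding code; the helper name is illustrative, not part of the script:

    import struct

    def read_ggla_file_header(fin):
        # magic 'ggla', format version, LoRA rank, LoRA alpha
        magic, version, lora_r, lora_alpha = struct.unpack("<4i", fin.read(16))
        assert magic == 0x67676c61 and version == 1
        return lora_r, lora_alpha, lora_alpha / lora_r  # r, alpha, scaling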
@@ -78,11 +47,12 @@ def write_tensor_header(
     fout.seek((fout.tell() + 31) & -32)

-if len(sys.argv) != 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
     print(
         "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
     )
+    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
     sys.exit(1)

 input_json = os.path.join(sys.argv[1], "adapter_config.json")
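Note: the fout.seek((fout.tell() + 31) & -32) call above pads each tensor header so the raw tensor data starts on a 32-byte boundary; llama.cpp applies the same rounding when it reads the adapter back. A quick sanity check of the arithmetic (illustrative only):

    def align32(offset: int) -> int:
        # round up to the next multiple of 32 (a no-op when already aligned)
        return (offset + 31) & -32

    assert align32(0) == 0
    assert align32(1) == 32
    assert align32(100) == 128
    assert align32(128) == 128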
@@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

 model = torch.load(input_model, map_location="cpu")
+arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+    print(f"Error: unsupported architecture {arch_name}")
+    sys.exit(1)
+
+arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

 with open(input_json, "r") as f:
     params = json.load(f)
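Note: the arch lookup above recovers the gguf.MODEL_ARCH enum value by indexing the parallel key/value lists of gguf.MODEL_ARCH_NAMES. An equivalent, arguably clearer formulation (a sketch, not what the script ships) would be:

    # hypothetical alternative to the list(...).index(...) reverse lookup
    arch = next(a for a, name in gguf.MODEL_ARCH_NAMES.items() if name == arch_name)
    name_map = gguf.TensorNameMap(arch, 200)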
@@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
     write_file_header(fout, params)

     for k, v in model.items():
+        orig_k = k
         if k.endswith(".default.weight"):
             k = k.replace(".default.weight", ".weight")
         if k in ["llama_proj.weight", "llama_proj.bias"]:
@@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
             v = v.float()

         t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
+        prefix = "base_model.model."
+        if k.startswith(prefix):
+            k = k[len(prefix) :]
+
+        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+        if k.endswith(lora_suffixes):
+            suffix = k[-len(lora_suffixes[0]):]
+            k = k[: -len(lora_suffixes[0])]
+        else:
+            print(f"Error: unrecognized tensor name {orig_k}")
+            sys.exit(1)
+
+        tname = name_map.get_name(k)
+        if tname is None:
+            print(f"Error: could not map tensor name {orig_k}")
+            print(f" Note: the arch parameter must be specified if the model is not llama")
+            sys.exit(1)
+
+        if suffix == ".lora_A.weight":
+            tname += ".weight.loraA"
+        elif suffix == ".lora_B.weight":
+            tname += ".weight.loraB"
+        else:
+            assert False
+
         print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
         write_tensor_header(fout, tname, t.shape, t.dtype)
         t.tofile(fout)
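Note: putting the new mapping path together, a PEFT tensor name loses the base_model.model. prefix and the .lora_A.weight/.lora_B.weight suffix, goes through gguf.TensorNameMap, and is re-suffixed with .weight.loraA/.weight.loraB, i.e. the same shape of name the removed translate_tensor_name produced. A hedged walk-through; the example input and the expected get_name result assume the standard llama mapping in gguf-py:

    k = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
    k = k[len("base_model.model."):]      # "model.layers.0.self_attn.q_proj.lora_A.weight"
    suffix = k[-len(".lora_A.weight"):]   # ".lora_A.weight"
    k = k[:-len(".lora_A.weight")]        # "model.layers.0.self_attn.q_proj"
    tname = name_map.get_name(k)          # expected: "blk.0.attn_q"
    tname += ".weight.loraA"              # "blk.0.attn_q.weight.loraA"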

llama.cpp

@@ -5906,36 +5906,50 @@ static int llama_apply_lora_from_file_internal(
     const int64_t t_start_lora_us = ggml_time_us();

-    auto fin = std::ifstream(path_lora, std::ios::binary);
-    if (!fin) {
-        LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
-        return 1;
-    }
+    // auto fin = std::ifstream(path_lora, std::ios::binary);
+    llama_file fin(path_lora, "rb");

     // verify magic and version
     {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        uint32_t format_version;
-        fin.read((char *) &format_version, sizeof(format_version));
+        uint32_t magic = fin.read_u32();
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
+
+        uint32_t format_version = fin.read_u32();

         if (format_version != 1) {
             LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
             return 1;
         }
     }

-    int32_t lora_r;
-    int32_t lora_alpha;
-    fin.read((char *) &lora_r, sizeof(lora_r));
-    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    int32_t lora_r = fin.read_u32();
+    int32_t lora_alpha = fin.read_u32();
     float scaling = (float)lora_alpha / (float)lora_r;

     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

+    // create a name -> tensor map of the model to accelerate lookups
+    size_t max_tensor_size = 0;
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (const auto & kv : model.tensors_by_name) {
+        model_tensors.insert(kv);
+        // find the max tensor size to estimate the required temporary buffer size
+        // skip input and output layers as they are not often finetuned and can be very large
+        if (kv.first.find("token_embd") != std::string::npos ||
+            kv.first.find("output") != std::string::npos) {
+            continue;
+        }
+        size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
+        max_tensor_size = std::max(max_tensor_size, f32_size);
+    }
+
     // create a temporary ggml context to store the lora tensors
-    // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    // TODO: use ggml-alloc
+    size_t lora_ctx_size = max_tensor_size * 3;
+    LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
+    std::vector<uint8_t> lora_buf(lora_ctx_size);
     struct ggml_init_params params;
     params.mem_size = lora_buf.size();
     params.mem_buffer = lora_buf.data();
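Note: the fixed 1 GiB scratch buffer is replaced by an estimate of three times the largest non-embedding/non-output tensor in f32. Worked through for a hypothetical 4096 x 11008 feed-forward weight:

    max_tensor_size = 4096 * 11008 * 4     # f32 size of the largest tensor: 172 MiB
    lora_ctx_size   = max_tensor_size * 3  # ~516 MiB, versus the previous fixed 1 GiB
    print(lora_ctx_size / 1024 / 1024)     # 516.0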
@@ -5944,11 +5958,6 @@ static int llama_apply_lora_from_file_internal(
     ggml_context * lora_ctx = ggml_init(params);
     std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;

-    // create a name -> tensor map of the model to accelerate lookups
-    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (const auto & kv : model.tensors_by_name) {
-        model_tensors.insert(kv);
-    }
-
     // load base model
     std::unique_ptr<llama_model_loader> ml;
@@ -5983,27 +5992,32 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> work_buffer;

     while (true) {
-        int32_t n_dims;
-        int32_t length;
-        int32_t ftype;
-
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-        if (fin.eof()) {
+        if (fin.tell() == fin.size) {
+            // eof
             break;
         }

+        int32_t n_dims;
+        int32_t name_len;
+        int32_t ftype;
+
+        fin.read_raw(&n_dims,   sizeof(n_dims));
+        fin.read_raw(&name_len, sizeof(name_len));
+        fin.read_raw(&ftype,    sizeof(ftype));
+
+        GGML_ASSERT(n_dims <= 2);
+
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            fin.read_raw(&ne[i], sizeof(ne[i]));
         }

         std::string name;
         {
+            GGML_ASSERT(name_len <= 1024);
             char buf[1024];
-            fin.read(buf, length);
-            name = std::string(buf, length);
+            fin.read_raw(buf, name_len);
+            name = std::string(buf, name_len);
         }

         // check for lora suffix and get the type of tensor
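Note: for reference, each tensor record in the adapter file is laid out as three int32 fields (n_dims, name length, ftype), then n_dims int32 dimensions, the UTF-8 tensor name, padding up to the next 32-byte boundary, and finally the raw tensor data. A minimal Python reader sketch mirroring this loop, assuming native little-endian int32 fields as written by struct.pack("i", ...); the function name is illustrative:

    import struct

    def read_ggla_tensor_record_header(fin):
        # returns (name, ftype, ne) or None at end of file
        head = fin.read(12)
        if len(head) < 12:
            return None
        n_dims, name_len, ftype = struct.unpack("<3i", head)
        ne = list(struct.unpack(f"<{n_dims}i", fin.read(4 * n_dims)))
        name = fin.read(name_len).decode("utf-8")
        fin.seek((fin.tell() + 31) & -32)  # tensor data starts at the next 32-byte boundary
        return name, ftype, ne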
@@ -6017,7 +6031,7 @@ static int llama_apply_lora_from_file_internal(
         std::string lora_type = name.substr(pos + lora_suffix.length());
         std::string base_name = name;
         base_name.erase(pos);
-        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());

         if (model_tensors.find(base_name) == model_tensors.end()) {
             LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@@ -6047,11 +6061,11 @@ static int llama_apply_lora_from_file_internal(
         ggml_set_name(lora_tensor, "lora_tensor");

         // load tensor data
-        size_t offset = fin.tellg();
+        size_t offset = fin.tell();
         size_t tensor_data_size = ggml_nbytes(lora_tensor);
         offset = (offset + 31) & -32;
-        fin.seekg(offset);
-        fin.read((char*)lora_tensor->data, tensor_data_size);
+        fin.seek(offset, SEEK_SET);
+        fin.read_raw(lora_tensor->data, tensor_data_size);

         lora_tensors[name] = lora_tensor;
@@ -6150,6 +6164,7 @@ static int llama_apply_lora_from_file_internal(
             ggml_graph_compute_helper(work_buffer, &gf, n_threads);

             // we won't need these tensors again, reset the context to save memory
+            GGML_ASSERT(lora_tensors.size() == 2);
             ggml_free(lora_ctx);
             lora_ctx = ggml_init(params);
             lora_tensors.clear();

llama.h

@@ -37,6 +37,7 @@
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
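Note: the new GGLA constant matches what the Python converter already writes: write_file_header emits b"ggla"[::-1], i.e. the bytes 'a' 'l' 'g' 'g', which read back as 0x67676c61 when loaded as a little-endian uint32. A small check:

    import struct

    magic_bytes = b"ggla"[::-1]                  # what write_file_header emits: b"algg"
    (magic,) = struct.unpack("<I", magic_bytes)  # read back the way llama.cpp reads it
    assert magic == 0x67676c61                   # LLAMA_FILE_MAGIC_GGLA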