lora : add support for non-llama models

ggml-ci
slaren 2023-09-26 01:35:38 +02:00
parent 1726f9626f
commit 8c1828aa6c
3 changed files with 96 additions and 76 deletions

convert-lora-to-ggml.py

@@ -11,43 +11,14 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
+from pathlib import Path
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf

 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}

-HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attn_q",
-    "self_attn.k_proj": "attn_k",
-    "self_attn.v_proj": "attn_v",
-    "self_attn.o_proj": "attn_output",
-    "mlp.gate_proj": "ffn_gate",
-    "mlp.down_proj": "ffn_down",
-    "mlp.up_proj": "ffn_up",
-    "input_layernorm": "attn_norm",
-    "post_attention_layernorm": "ffn_norm",
-}
-
-def translate_tensor_name(t: str) -> str:
-    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
-    if match:
-        nn = match.group(1)
-        sub_layer = match.group(2)
-        lora_type = match.group(3)
-
-        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
-        if sub_layer_renamed is None:
-            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
-            sys.exit(1)
-
-        output_string = (
-            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
-        )
-        return output_string
-    else:
-        print(f"Error: unrecognized tensor {t}")
-        sys.exit(1)

 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(b"ggla"[::-1]) # magic (ggml lora)
     fout.write(struct.pack("i", 1)) # file version
@@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(struct.pack("i", int(params["lora_alpha"])))

-def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
-) -> None:
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
     sname = name.encode("utf-8")
     fout.write(
         struct.pack(
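Note: the adapter file header written by write_file_header (and read back in llama_apply_lora_from_file_internal below) consists of four 32-bit fields: the 'ggla' magic, a format version, the LoRA rank r, and lora_alpha. A minimal reader sketch, assuming the fields are emitted as native little-endian int32 via struct.pack("i", ...) as in the surrounding code; the helper name is illustrative, not part of the script:

    import struct

    def read_ggla_file_header(fin):
        # magic 'ggla', format version, LoRA rank, LoRA alpha
        magic, version, lora_r, lora_alpha = struct.unpack("<4i", fin.read(16))
        assert magic == 0x67676c61 and version == 1
        return lora_r, lora_alpha, lora_alpha / lora_r  # r, alpha, scaling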
@@ -78,11 +47,12 @@ def write_tensor_header(
     fout.seek((fout.tell() + 31) & -32)

-if len(sys.argv) != 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
     print(
         "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
     )
+    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
     sys.exit(1)

 input_json = os.path.join(sys.argv[1], "adapter_config.json")
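Note: the fout.seek((fout.tell() + 31) & -32) call above pads each tensor header so the raw tensor data starts on a 32-byte boundary; llama.cpp applies the same rounding when it reads the adapter back. A quick sanity check of the arithmetic (illustrative only):

    def align32(offset: int) -> int:
        # round up to the next multiple of 32 (a no-op when already aligned)
        return (offset + 31) & -32

    assert align32(0) == 0
    assert align32(1) == 32
    assert align32(100) == 128
    assert align32(128) == 128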
@@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

 model = torch.load(input_model, map_location="cpu")
+arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+    print(f"Error: unsupported architecture {arch_name}")
+    sys.exit(1)
+
+arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

 with open(input_json, "r") as f:
     params = json.load(f)
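Note: the arch lookup above recovers the gguf.MODEL_ARCH enum value by indexing the parallel key/value lists of gguf.MODEL_ARCH_NAMES. An equivalent, arguably clearer formulation (a sketch, not what the script ships) would be:

    # hypothetical alternative to the list(...).index(...) reverse lookup
    arch = next(a for a, name in gguf.MODEL_ARCH_NAMES.items() if name == arch_name)
    name_map = gguf.TensorNameMap(arch, 200)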
@@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
     write_file_header(fout, params)

     for k, v in model.items():
+        orig_k = k
         if k.endswith(".default.weight"):
             k = k.replace(".default.weight", ".weight")
         if k in ["llama_proj.weight", "llama_proj.bias"]:
@@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
             v = v.float()

         t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
+        prefix = "base_model.model."
+        if k.startswith(prefix):
+            k = k[len(prefix) :]
+
+        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+        if k.endswith(lora_suffixes):
+            suffix = k[-len(lora_suffixes[0]):]
+            k = k[: -len(lora_suffixes[0])]
+        else:
+            print(f"Error: unrecognized tensor name {orig_k}")
+            sys.exit(1)
+
+        tname = name_map.get_name(k)
+        if tname is None:
+            print(f"Error: could not map tensor name {orig_k}")
+            print(f" Note: the arch parameter must be specified if the model is not llama")
+            sys.exit(1)
+
+        if suffix == ".lora_A.weight":
+            tname += ".weight.loraA"
+        elif suffix == ".lora_B.weight":
+            tname += ".weight.loraB"
+        else:
+            assert False
+
         print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
         write_tensor_header(fout, tname, t.shape, t.dtype)
         t.tofile(fout)
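Note: putting the new mapping path together, a PEFT tensor name loses the base_model.model. prefix and the .lora_A.weight/.lora_B.weight suffix, goes through gguf.TensorNameMap, and is re-suffixed with .weight.loraA/.weight.loraB, i.e. the same shape of name the removed translate_tensor_name produced. A hedged walk-through; the example input and the expected get_name result assume the standard llama mapping in gguf-py:

    k = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
    k = k[len("base_model.model."):]      # "model.layers.0.self_attn.q_proj.lora_A.weight"
    suffix = k[-len(".lora_A.weight"):]   # ".lora_A.weight"
    k = k[:-len(".lora_A.weight")]        # "model.layers.0.self_attn.q_proj"
    tname = name_map.get_name(k)          # expected: "blk.0.attn_q"
    tname += ".weight.loraA"              # "blk.0.attn_q.weight.loraA"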

llama.cpp

@@ -5906,36 +5906,50 @@ static int llama_apply_lora_from_file_internal(
     const int64_t t_start_lora_us = ggml_time_us();

-    auto fin = std::ifstream(path_lora, std::ios::binary);
-    if (!fin) {
-        LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
-        return 1;
-    }
+    // auto fin = std::ifstream(path_lora, std::ios::binary);
+    llama_file fin(path_lora, "rb");

     // verify magic and version
     {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        uint32_t format_version;
-        fin.read((char *) &format_version, sizeof(format_version));
+        uint32_t magic = fin.read_u32();
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
+
+        uint32_t format_version = fin.read_u32();

         if (format_version != 1) {
             LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
             return 1;
         }
     }

-    int32_t lora_r;
-    int32_t lora_alpha;
-    fin.read((char *) &lora_r, sizeof(lora_r));
-    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    int32_t lora_r = fin.read_u32();
+    int32_t lora_alpha = fin.read_u32();
     float scaling = (float)lora_alpha / (float)lora_r;

     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

+    // create a name -> tensor map of the model to accelerate lookups
+    size_t max_tensor_size = 0;
+    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    for (const auto & kv : model.tensors_by_name) {
+        model_tensors.insert(kv);
+        // find the max tensor size to estimate the required temporary buffer size
+        // skip input and output layers as they are not often finetuned and can be very large
+        if (kv.first.find("token_embd") != std::string::npos ||
+            kv.first.find("output") != std::string::npos) {
+            continue;
+        }
+        size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
+        max_tensor_size = std::max(max_tensor_size, f32_size);
+    }
+
     // create a temporary ggml context to store the lora tensors
-    // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    // TODO: use ggml-alloc
+    size_t lora_ctx_size = max_tensor_size * 3;
+    LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
+    std::vector<uint8_t> lora_buf(lora_ctx_size);
     struct ggml_init_params params;
     params.mem_size = lora_buf.size();
     params.mem_buffer = lora_buf.data();
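Note: the fixed 1 GiB scratch buffer is replaced by an estimate of three times the largest non-embedding/non-output tensor in f32. Worked through for a hypothetical 4096 x 11008 feed-forward weight:

    max_tensor_size = 4096 * 11008 * 4     # f32 size of the largest tensor: 172 MiB
    lora_ctx_size   = max_tensor_size * 3  # ~516 MiB, versus the previous fixed 1 GiB
    print(lora_ctx_size / 1024 / 1024)     # 516.0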
@@ -5944,11 +5958,6 @@ static int llama_apply_lora_from_file_internal(
     ggml_context * lora_ctx = ggml_init(params);
     std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;

-    // create a name -> tensor map of the model to accelerate lookups
-    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (const auto & kv : model.tensors_by_name) {
-        model_tensors.insert(kv);
-    }
-
     // load base model
     std::unique_ptr<llama_model_loader> ml;
@@ -5983,27 +5992,32 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> work_buffer;

     while (true) {
-        int32_t n_dims;
-        int32_t length;
-        int32_t ftype;
-
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-        if (fin.eof()) {
+        if (fin.tell() == fin.size) {
+            // eof
             break;
         }

+        int32_t n_dims;
+        int32_t name_len;
+        int32_t ftype;
+
+        fin.read_raw(&n_dims,   sizeof(n_dims));
+        fin.read_raw(&name_len, sizeof(name_len));
+        fin.read_raw(&ftype,    sizeof(ftype));
+
+        GGML_ASSERT(n_dims <= 2);
+
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            fin.read_raw(&ne[i], sizeof(ne[i]));
         }

         std::string name;
         {
+            GGML_ASSERT(name_len <= 1024);
             char buf[1024];
-            fin.read(buf, length);
-            name = std::string(buf, length);
+            fin.read_raw(buf, name_len);
+            name = std::string(buf, name_len);
         }

         // check for lora suffix and get the type of tensor
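Note: for reference, each tensor record in the adapter file is laid out as three int32 fields (n_dims, name length, ftype), then n_dims int32 dimensions, the UTF-8 tensor name, padding up to the next 32-byte boundary, and finally the raw tensor data. A minimal Python reader sketch mirroring this loop, assuming native little-endian int32 fields as written by struct.pack("i", ...); the function name is illustrative:

    import struct

    def read_ggla_tensor_record_header(fin):
        # returns (name, ftype, ne) or None at end of file
        head = fin.read(12)
        if len(head) < 12:
            return None
        n_dims, name_len, ftype = struct.unpack("<3i", head)
        ne = list(struct.unpack(f"<{n_dims}i", fin.read(4 * n_dims)))
        name = fin.read(name_len).decode("utf-8")
        fin.seek((fin.tell() + 31) & -32)  # tensor data starts at the next 32-byte boundary
        return name, ftype, ne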
@@ -6017,7 +6031,7 @@ static int llama_apply_lora_from_file_internal(
         std::string lora_type = name.substr(pos + lora_suffix.length());
         std::string base_name = name;
         base_name.erase(pos);
-        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());

         if (model_tensors.find(base_name) == model_tensors.end()) {
             LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@@ -6047,11 +6061,11 @@ static int llama_apply_lora_from_file_internal(
         ggml_set_name(lora_tensor, "lora_tensor");

         // load tensor data
-        size_t offset = fin.tellg();
+        size_t offset = fin.tell();
         size_t tensor_data_size = ggml_nbytes(lora_tensor);
         offset = (offset + 31) & -32;
-        fin.seekg(offset);
-        fin.read((char*)lora_tensor->data, tensor_data_size);
+        fin.seek(offset, SEEK_SET);
+        fin.read_raw(lora_tensor->data, tensor_data_size);

         lora_tensors[name] = lora_tensor;
@@ -6150,6 +6164,7 @@ static int llama_apply_lora_from_file_internal(
             ggml_graph_compute_helper(work_buffer, &gf, n_threads);

             // we won't need these tensors again, reset the context to save memory
+            GGML_ASSERT(lora_tensors.size() == 2);
             ggml_free(lora_ctx);
             lora_ctx = ggml_init(params);
             lora_tensors.clear();

llama.h

@@ -37,6 +37,7 @@
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
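Note: the new GGLA constant matches what the Python converter already writes: write_file_header emits b"ggla"[::-1], i.e. the bytes 'a' 'l' 'g' 'g', which read back as 0x67676c61 when loaded as a little-endian uint32. A small check:

    import struct

    magic_bytes = b"ggla"[::-1]                  # what write_file_header emits: b"algg"
    (magic,) = struct.unpack("<I", magic_bytes)  # read back the way llama.cpp reads it
    assert magic == 0x67676c61                   # LLAMA_FILE_MAGIC_GGLA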