From ac3fbe492accb741a7796061fa4bb277686ff8b5 Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Sat, 8 Apr 2023 03:37:12 +0200
Subject: [PATCH] Export lora A matrix pre-transposed

---
 convert-lora-to-ggml.py |  2 ++
 llama.cpp               | 12 ++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
index 988627181..ef1aa5305 100644
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -94,6 +94,8 @@ with open(output_path, "wb") as fout:
         # since ggml doesn't always support other types for the second operand,
         # the tensors are always converted and exported as f32
         t = v.float().numpy()
+        if "lora_A" in k:
+            t = t.T
         print(f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
         write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
         t.tofile(fout)
diff --git a/llama.cpp b/llama.cpp
index ba1f089b8..bb7d3e2d9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1760,8 +1760,12 @@ int llama_model_quantize(
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
     // TODO: refactor all of this after PR #801
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
     auto & model = ctx->model;
 
+    const int64_t t_start_lora_us = ggml_time_us();
+
     auto fin = std::ifstream(path_lora, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
@@ -1874,7 +1878,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * tensor = model.tensors[base_name];
-            ggml_tensor * loraA = ggml_transpose(lora_ctx, lora_tensors[base_name + ".loraA"]);
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
             if (tensor->ne[0] != loraA->ne[1]) {
@@ -1901,7 +1905,11 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             fprintf(stderr, ".");
         }
     }
-    fprintf(stderr, " done\n");
+
+    ggml_free(lora_ctx);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
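
For context on the change above (this explanation is not part of the patch, and the shapes and variable names below are illustrative assumptions): a LoRA adapter stores a low-rank update delta_W = B * A, where lora_A has shape (rank, n_in) and lora_B has shape (n_out, rank). The converter now writes lora_A already transposed, which lets the loader drop its per-tensor ggml_transpose call. The following minimal NumPy sketch shows only the linear-algebra invariant, treating the matrices in plain row-major terms and ignoring ggml's own mul_mat layout conventions: moving the transpose from load time to export time leaves delta_W unchanged.

import numpy as np

# Illustrative shapes only; a real adapter uses the model's hidden sizes.
rank, n_in, n_out = 4, 64, 64
lora_A = np.random.rand(rank, n_in).astype(np.float32)   # "lora_A" factor
lora_B = np.random.rand(n_out, rank).astype(np.float32)  # "lora_B" factor

# Before this patch: A was written as-is and transposed at load time.
delta_before = lora_B @ lora_A                 # delta_W = B * A

# After this patch: the converter writes A pre-transposed (t = t.T), so the
# file already holds A in the orientation the loader consumes. Since
# (A^T)^T = A, the reconstructed delta is identical.
a_on_disk = lora_A.T                           # what convert-lora-to-ggml.py now emits
delta_after = lora_B @ a_on_disk.T             # delta_W = B * (A^T)^T = B * A

assert np.allclose(delta_before, delta_after)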