diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
index 535381ecb..c6d48b976 100644
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -44,18 +44,18 @@ NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {
 }
 
 HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attention.wq.weight",
-    "self_attn.k_proj": "attention.wk.weight",
-    "self_attn.v_proj": "attention.wv.weight",
-    "self_attn.o_proj": "attention.wo.weight",
-    # "embed_tokens.weight": "tok_embeddings.weight",
-    # "norm.weight": "norm.weight",
-    # "lm_head.weight": "output.weight",
-    # "mlp.gate_proj": "feed_forward.w1.weight",
-    # "mlp.down_proj": "feed_forward.w2.weight",
-    # "mlp.up_proj": "feed_forward.w3.weight",
-    # "input_layernorm": "attention_norm.weight",
-    # "post_attention_layernorm": "ffn_norm.weight",
+    "self_attn.q_proj": "attention.wq",
+    "self_attn.k_proj": "attention.wk",
+    "self_attn.v_proj": "attention.wv",
+    "self_attn.o_proj": "attention.wo",
+    "mlp.gate_proj": "feed_forward.w1",
+    "mlp.down_proj": "feed_forward.w2",
+    "mlp.up_proj": "feed_forward.w3",
+    "input_layernorm": "attention_norm",
+    "post_attention_layernorm": "ffn_norm",
+    # "norm": "norm",
+    # "embed_tokens": "tok_embeddings",
+    # "lm_head": "output",
 }
 
 
@@ -71,7 +71,9 @@ def translate_tensor_name(t):
             print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
             sys.exit(1)
 
-        output_string = f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.lora{lora_type}"
+        output_string = (
+            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+        )
         return output_string
     else:
         print(f"Error: unrecognized tensor {t}")
@@ -138,16 +140,17 @@ with open(output_path, "wb") as fout:
 
     write_file_header(fout, params)
     for k, v in model.items():
-        # since ggml doesn't always support other types for the second operand,
-        # the tensors are always converted and exported as f32
-        v = v.float()
+        if k.endswith("lora_A.weight"):
+            if v.dtype != torch.float16 and v.dtype != torch.float32:
+                v = v.float()
+            v = v.T
+        else:
+            v = v.float()
+
         t = v.numpy()
-        if "lora_A" in k:
-            t = t.T
-        print(
-            f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB"
-        )
-        write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
+        tname = translate_tensor_name(k)
+        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, tname, t.shape, t.dtype)
         t.tofile(fout)
 
 print(f"Converted {input_json} and {input_model} to {output_path}")
diff --git a/ggml.c b/ggml.c
index 3dffae8b3..8606e9344 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5955,11 +5955,6 @@ static void ggml_compute_forward_add_q_f32(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
     GGML_ASSERT(dst->type == src0->type);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
diff --git a/llama.cpp b/llama.cpp
index db534ddb7..0627c9b9c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -617,6 +617,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
+
         return get_tensor_for(lt);
     }
 
@@ -1799,7 +1800,8 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 
 
     // create a temporary ggml context to store the lora tensors
-    std::vector<uint8_t> buf(1024 * 1024 * 100);
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
     struct ggml_init_params params;
     params.mem_size   = buf.size();
     params.mem_buffer = buf.data();
@@ -1830,11 +1832,9 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             break;
         }
 
-        int32_t nelements = 1;
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-            nelements *= ne[i];
         }
 
         std::string name(length, 0);
@@ -1903,24 +1903,26 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             }
 
             // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
             }
 
+            //printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
+            //    base_name.c_str(),
+            //    (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
+            //    (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
+            //    (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
+            //    (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
+            //);
             ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
-            //ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
-            //r = ggml_cpy(lora_ctx, r, tensor);
 
             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
             ggml_graph_compute(lora_ctx, &gf);
 
-            // hack until ggml_cpy supports quantized tensors
-            // memcpy(tensor->data, r->data, ggml_nbytes(tensor));
-
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
             lora_ctx = ggml_init(params);
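Note (illustrative, not part of the patch): the update applied in the last llama.cpp hunk is w = w + BA*s, where s is the adapter's scaling factor (conventionally lora_alpha / lora_r in PEFT). The NumPy sketch below spells out that update with assumed, illustrative shapes and names; none of them are taken from the patch.

    import numpy as np

    n_out, n_in, rank = 4096, 4096, 8   # illustrative dimensions
    lora_alpha = 16                     # typical adapter hyperparameter
    scaling = lora_alpha / rank

    W = np.zeros((n_out, n_in), dtype=np.float32)        # base weight
    A = np.random.randn(rank, n_in).astype(np.float32)   # lora_A (r x in), PEFT convention
    B = np.random.randn(n_out, rank).astype(np.float32)  # lora_B (out x r)

    # w = w + BA*s, the same update the lora loop applies tensor by tensor
    W_merged = W + scaling * (B @ A)
    assert W_merged.shape == W.shape    # delta has the same shape as the base weight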