Support more layer types, fix memory and generation issues

Slaren 2023-04-11 23:15:29 +02:00
parent c920f00136
commit c45868ba9f
3 changed files with 36 additions and 36 deletions


@@ -44,18 +44,18 @@ NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {
 }
 HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attention.wq.weight",
-    "self_attn.k_proj": "attention.wk.weight",
-    "self_attn.v_proj": "attention.wv.weight",
-    "self_attn.o_proj": "attention.wo.weight",
-    # "embed_tokens.weight": "tok_embeddings.weight",
-    # "norm.weight": "norm.weight",
-    # "lm_head.weight": "output.weight",
-    # "mlp.gate_proj": "feed_forward.w1.weight",
-    # "mlp.down_proj": "feed_forward.w2.weight",
-    # "mlp.up_proj": "feed_forward.w3.weight",
-    # "input_layernorm": "attention_norm.weight",
-    # "post_attention_layernorm": "ffn_norm.weight",
+    "self_attn.q_proj": "attention.wq",
+    "self_attn.k_proj": "attention.wk",
+    "self_attn.v_proj": "attention.wv",
+    "self_attn.o_proj": "attention.wo",
+    "mlp.gate_proj": "feed_forward.w1",
+    "mlp.down_proj": "feed_forward.w2",
+    "mlp.up_proj": "feed_forward.w3",
+    "input_layernorm": "attention_norm",
+    "post_attention_layernorm": "ffn_norm",
+    # "norm": "norm",
+    # "embed_tokens": "tok_embeddings",
+    # "lm_head": "output",
 }
@@ -71,7 +71,9 @@ def translate_tensor_name(t):
             print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
             sys.exit(1)

-        output_string = f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.lora{lora_type}"
+        output_string = (
+            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+        )
         return output_string
     else:
         print(f"Error: unrecognized tensor {t}")
@@ -138,16 +140,17 @@ with open(output_path, "wb") as fout:
     write_file_header(fout, params)
     for k, v in model.items():
-        # since ggml doesn't always support other types for the second operand,
-        # the tensors are always converted and exported as f32
-        v = v.float()
+        if k.endswith("lora_A.weight"):
+            if v.dtype != torch.float16 and v.dtype != torch.float32:
+                v = v.float()
+            v = v.T
+        else:
+            v = v.float()
         t = v.numpy()
-        if "lora_A" in k:
-            t = t.T
-        print(
-            f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB"
-        )
-        write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
+        tname = translate_tensor_name(k)
+        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, tname, t.shape, t.dtype)
         t.tofile(fout)

 print(f"Converted {input_json} and {input_model} to {output_path}")

ggml.c

@@ -5955,11 +5955,6 @@ static void ggml_compute_forward_add_q_f32(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);

-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
     GGML_ASSERT(dst->type == src0->type);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);


@@ -617,6 +617,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
         return get_tensor_for(lt);
     }
@@ -1799,7 +1800,8 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     // create a temporary ggml context to store the lora tensors
-    std::vector<uint8_t> buf(1024 * 1024 * 100);
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
     struct ggml_init_params params;
     params.mem_size   = buf.size();
     params.mem_buffer = buf.data();
@@ -1830,11 +1832,9 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             break;
         }

-        int32_t nelements = 1;
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-            nelements *= ne[i];
         }

         std::string name(length, 0);
@@ -1903,24 +1903,26 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             }

             // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);

             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
             }

+            //printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
+            //    base_name.c_str(),
+            //    (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
+            //    (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
+            //    (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
+            //    (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
+            //);
             ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
-            //ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
-            //r = ggml_cpy(lora_ctx, r, tensor);

             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
             ggml_graph_compute(lora_ctx, &gf);

-            // hack until ggml_cpy supports quantized tensors
-            // memcpy(tensor->data, r->data, ggml_nbytes(tensor));
-
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
             lora_ctx = ggml_init(params);
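
The operand swap in the last hunk changes how BA is built, but the update itself is still the one described by the // w = w + BA*s comment: a rank-r correction, scaled by s, added onto the base weight. A numpy sketch of that math (shapes and the scaling value are illustrative; ggml's mul_mat operand convention is not reproduced here):

import numpy as np

# typical LoRA shapes: W is (out, in), A is (r, in), B is (out, r)
out_features, in_features, r = 6, 4, 2
rng = np.random.default_rng(0)
W = rng.standard_normal((out_features, in_features)).astype(np.float32)
A = rng.standard_normal((r, in_features)).astype(np.float32)
B = rng.standard_normal((out_features, r)).astype(np.float32)
scaling = 0.5  # in LoRA this corresponds to alpha / r

# w = w + BA*s, the update the ggml graph above computes in place on the model tensor
W_adapted = W + scaling * (B @ A)
print(W_adapted.shape)  # (6, 4)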