From de64f091c8b0ab1df364f93a2a0396d112f55692 Mon Sep 17 00:00:00 2001
From: akawrykow
Date: Tue, 29 Aug 2023 15:13:04 -0700
Subject: [PATCH] Skip qkv reshaping for non-parallel attention

---
 convert-falcon-hf-to-gguf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
index 5948fc788..c7fe849f0 100755
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -206,6 +206,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 
 # params for qkv transform
 head_dim = hparams["hidden_size"] // n_head
+parallel_attn = hparams["parallel_attn"]
 
 # tensor info
 print("gguf: get tensor metadata")
@@ -240,7 +241,7 @@ for part_name in part_names:
         # in contiguous fashion.
         # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
 
-        if "query_key_value" in name:
+        if "query_key_value" in name and parallel_attn:
            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
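
Note for reviewers: below is a minimal, self-contained sketch (not part of the patch) of the qkv de-interleaving that the guarded block performs, and of what the new flag skips. The toy dimensions (n_head, n_head_kv, head_dim) are hypothetical stand-ins for the values the converter reads from hparams; the view/reshape lines mirror the script.

import torch

# Hypothetical toy dimensions; the real values come from the model's hparams.
n_head    = 8                      # query heads
n_head_kv = 2                      # kv groups (multi-query/grouped attention)
head_dim  = 4                      # per-head width
hidden    = n_head * head_dim      # written as head_dim * n_head in the script

# Dummy fused qkv weight in the parallel-attention layout: each kv group
# stores n_head // n_head_kv query heads followed by one key and one value
# head shared by that group.
rows = (n_head + 2 * n_head_kv) * head_dim
data = torch.arange(rows * hidden, dtype=torch.float32).reshape(rows, hidden)

parallel_attn = True               # hparams["parallel_attn"] in the converter

if parallel_attn:
    # De-interleave into contiguous q, k, v blocks (same ops as the patch).
    qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, hidden)
    q = qkv[:, :-2 ].reshape(n_head * head_dim, hidden)     # all query heads
    k = qkv[:, [-2]].reshape(n_head_kv * head_dim, hidden)  # one key head per group
    v = qkv[:, [-1]].reshape(n_head_kv * head_dim, hidden)  # one value head per group
    data = torch.cat((q, k, v)).reshape_as(data)
# else: non-parallel checkpoints (e.g. Falcon-RW variants) are assumed to
# store q, k, v contiguously already, so the tensor passes through unchanged.

The design rationale, as I read the patch: the interleaved-by-kv-group layout only appears in parallel-attention Falcon checkpoints, so applying the rearrangement to a non-parallel model would scramble an already-contiguous [q; k; v] tensor; gating on parallel_attn leaves those weights untouched.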