falcon : support non-40B models

This commit is contained in:
Georgi Gerganov 2023-08-22 22:52:14 +03:00
parent 3c7c325b98
commit 2d58444dae
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 26 additions and 21 deletions

View file

@@ -101,7 +101,10 @@ gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])

 # TOKENIZATION
@@ -201,7 +204,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # params for qkv transform
 n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else n_head
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
 head_dim = hparams["hidden_size"] // n_head

View file

@@ -1861,8 +1861,11 @@ static void llm_load_tensors(
 layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
 layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, GGML_BACKEND_CPU);
-layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, GGML_BACKEND_CPU);
+if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+    layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+    layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, GGML_BACKEND_CPU);
+}
 layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, GGML_BACKEND_CPU);
 layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, GGML_BACKEND_CPU);
@@ -2421,17 +2424,17 @@ static struct ggml_cgraph * llm_build_falcon(
 for (int il = 0; il < n_layer; ++il) {
     struct ggml_tensor * cur;
-    struct ggml_tensor * layernorm_output;
+    struct ggml_tensor * attn_norm;

     // self-attention
     {
-        layernorm_output = ggml_norm(ctx0, inpL);
+        attn_norm = ggml_norm(ctx0, inpL);

-        layernorm_output = ggml_add(ctx0,
+        attn_norm = ggml_add(ctx0,
             ggml_mul(ctx0,
-                ggml_repeat(ctx0, model.layers[il].attn_norm, layernorm_output),
-                layernorm_output),
-            ggml_repeat(ctx0, model.layers[il].attn_norm_b, layernorm_output));
+                ggml_repeat(ctx0, model.layers[il].attn_norm, attn_norm),
+                attn_norm),
+            ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm));

         if (hparams.n_head_kv == 8) { // Falcon-40B
             cur = ggml_norm(ctx0, inpL);
@@ -2441,9 +2444,8 @@ static struct ggml_cgraph * llm_build_falcon(
                     ggml_repeat(ctx0, model.layers[il].attn_norm_2, cur),
                     cur),
                 ggml_repeat(ctx0, model.layers[il].attn_norm_2_b, cur));
-        }
-        else { // Falcon 7B
-            cur = layernorm_output;
+        } else { // Falcon 7B
+            cur = attn_norm;
         }

         // compute QKV
// compute QKV // compute QKV
@@ -2563,7 +2565,7 @@ static struct ggml_cgraph * llm_build_falcon(
             }
         }

-        struct ggml_tensor* inpFF = layernorm_output;
+        struct ggml_tensor * inpFF = attn_norm;

         struct ggml_tensor * attn_out = ggml_cpy(
             ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));