falcon : support non-40B models

This commit is contained in:
Georgi Gerganov 2023-08-22 22:52:14 +03:00
parent 3c7c325b98
commit 2d58444dae
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 26 additions and 21 deletions

View file

@@ -101,7 +101,10 @@ gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])

 # TOKENIZATION
@@ -201,7 +204,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # params for qkv transform
 n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else n_head
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
 head_dim = hparams["hidden_size"] // n_head

View file

@@ -1859,10 +1859,13 @@ static void llm_load_tensors(
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = model.layers[i];

         layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, GGML_BACKEND_CPU);

-        layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-        layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+            layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+            layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+        }

         layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, GGML_BACKEND_CPU);
         layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},                GGML_BACKEND_CPU);
@@ -2421,19 +2424,19 @@ static struct ggml_cgraph * llm_build_falcon(
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * cur;
-        struct ggml_tensor * layernorm_output;
+        struct ggml_tensor * attn_norm;

         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            attn_norm = ggml_norm(ctx0, inpL);

-            layernorm_output = ggml_add(ctx0,
+            attn_norm = ggml_add(ctx0,
                     ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attn_norm, layernorm_output),
-                        layernorm_output),
-                    ggml_repeat(ctx0, model.layers[il].attn_norm_b, layernorm_output));
+                        ggml_repeat(ctx0, model.layers[il].attn_norm, attn_norm),
+                        attn_norm),
+                    ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm));

-            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+            if (hparams.n_head_kv == 8) { // Falcon-40B
                 cur = ggml_norm(ctx0, inpL);

                 cur = ggml_add(ctx0,
@@ -2441,9 +2444,8 @@ static struct ggml_cgraph * llm_build_falcon(
                         ggml_repeat(ctx0, model.layers[il].attn_norm_2, cur),
                         cur),
                     ggml_repeat(ctx0, model.layers[il].attn_norm_2_b, cur));
-            }
-            else { // Falcon 7B
-                cur = layernorm_output;
+            } else { // Falcon 7B
+                cur = attn_norm;
             }

             // compute QKV
@@ -2563,8 +2565,8 @@ static struct ggml_cgraph * llm_build_falcon(
             }
         }

-        struct ggml_tensor* inpFF = layernorm_output;
-        struct ggml_tensor* attn_out = ggml_cpy(
+        struct ggml_tensor * inpFF = attn_norm;
+        struct ggml_tensor * attn_out = ggml_cpy(
                 ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

         {
@@ -2607,7 +2609,7 @@ static struct ggml_cgraph * llama_build_graph(
         const float * embd,
                 int   n_tokens,
                 int   n_past) {
     const auto & model = lctx.model;

     struct ggml_cgraph * result = NULL;
@@ -2669,8 +2671,8 @@ static bool llama_eval_internal(
     GGML_ASSERT(!!kv_self.ctx);

     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;

     ggml_allocr_reset(lctx.alloc);