falcon : support non-40B models
parent 3c7c325b98
commit 2d58444dae
2 changed files with 26 additions and 21 deletions
@@ -101,7 +101,10 @@ gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])

 # TOKENIZATION
@@ -201,7 +204,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

 # params for qkv transform
 n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else n_head
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

 head_dim = hparams["hidden_size"] // n_head

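For context (not part of the diff): Falcon-7B checkpoints use multi-query attention, so their Hugging Face config typically has no "n_head_kv" entry and the converter now falls back to a single KV head, while Falcon-40B configs carry "n_head_kv": 8. A minimal sketch of that fallback, with hypothetical hparams dicts standing in for the parsed config.json:

# Sketch only (not from the commit): how the converter's fallback resolves
# the KV head count for the two Falcon variants.
def resolve_n_head_kv(hparams: dict) -> int:
    # Multi-query models (Falcon-7B style) omit "n_head_kv" -> one shared KV head.
    return hparams["n_head_kv"] if "n_head_kv" in hparams else 1

falcon_40b = {"n_head": 128, "n_head_kv": 8}   # hypothetical config excerpt
falcon_7b  = {"n_head": 71}                    # no "n_head_kv" key

assert resolve_n_head_kv(falcon_40b) == 8
assert resolve_n_head_kv(falcon_7b)  == 1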
llama.cpp (40 changed lines)
@@ -1859,10 +1859,13 @@ static void llm_load_tensors(
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     auto & layer = model.layers[i];

-                    layer.attn_norm     = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i),   {n_embd}, GGML_BACKEND_CPU);
-                    layer.attn_norm_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),     {n_embd}, GGML_BACKEND_CPU);
-                    layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-                    layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+                    layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+
+                    if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+                        layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+                        layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+                    }

                     layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, GGML_BACKEND_CPU);
                     layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},               GGML_BACKEND_CPU);
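As a side note (not part of the diff), one can check from Python whether a converted GGUF file actually carries the optional attn_norm_2 tensors that the loader now probes for with gguf_find_tensor. A rough sketch, assuming the gguf Python package's GGUFReader and a hypothetical falcon-7b.gguf path:

# Sketch only: check whether a GGUF file carries the optional second attention norm.
# Assumes the gguf Python package (gguf-py); "falcon-7b.gguf" is a hypothetical path.
from gguf import GGUFReader

reader = GGUFReader("falcon-7b.gguf")
names  = {t.name for t in reader.tensors}

# Falcon-40B style exports include blk.<i>.attn_norm_2.{weight,bias};
# Falcon-7B style exports do not, so the C++ loader simply skips them.
print("attn_norm_2 present:", any(n.endswith(".attn_norm_2.weight") for n in names))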
@@ -2421,19 +2424,19 @@ static struct ggml_cgraph * llm_build_falcon(

     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * cur;
-        struct ggml_tensor * layernorm_output;
+        struct ggml_tensor * attn_norm;

         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            attn_norm = ggml_norm(ctx0, inpL);

-            layernorm_output = ggml_add(ctx0,
+            attn_norm = ggml_add(ctx0,
                     ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attn_norm, layernorm_output),
-                        layernorm_output),
-                    ggml_repeat(ctx0, model.layers[il].attn_norm_b, layernorm_output));
+                        ggml_repeat(ctx0, model.layers[il].attn_norm, attn_norm),
+                        attn_norm),
+                    ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm));

-            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+            if (hparams.n_head_kv == 8) { // Falcon-40B
                 cur = ggml_norm(ctx0, inpL);

                 cur = ggml_add(ctx0,
@@ -2441,9 +2444,8 @@ static struct ggml_cgraph * llm_build_falcon(
                         ggml_repeat(ctx0, model.layers[il].attn_norm_2, cur),
                         cur),
                     ggml_repeat(ctx0, model.layers[il].attn_norm_2_b, cur));
-            }
-            else { // Falcon 7B
-                cur = layernorm_output;
+            } else { // Falcon 7B
+                cur = attn_norm;
             }

             // compute QKV
@@ -2563,8 +2565,8 @@ static struct ggml_cgraph * llm_build_falcon(
             }
         }

-        struct ggml_tensor* inpFF = layernorm_output;
-        struct ggml_tensor* attn_out = ggml_cpy(
+        struct ggml_tensor * inpFF = attn_norm;
+        struct ggml_tensor * attn_out = ggml_cpy(
                 ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

         {
@@ -2607,7 +2609,7 @@ static struct ggml_cgraph * llama_build_graph(
            const float * embd,
                      int n_tokens,
                      int n_past) {
-    const auto & model = lctx.model;
+    const auto & model = lctx.model;

     struct ggml_cgraph * result = NULL;

@@ -2669,8 +2671,8 @@ static bool llama_eval_internal(

     GGML_ASSERT(!!kv_self.ctx);

-    const int64_t n_embd = hparams.n_embd;
-    const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_vocab = hparams.n_vocab;

     ggml_allocr_reset(lctx.alloc);
