falcon : support non-40B models
commit 2d58444dae
parent 3c7c325b98
2 changed files with 26 additions and 21 deletions
@@ -101,7 +101,10 @@ gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 
 # TOKENIZATION
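
The new else branch covers Falcon-7B, whose config.json does not define n_head_kv at all: the 7B model uses multi-query attention with a single shared K/V head, while Falcon-40B ships n_head_kv = 8. A minimal sketch of the intended behavior, using hypothetical hparams dicts in place of the real config files:

# Sketch only: hypothetical hparams dicts standing in for config.json contents.
def head_count_kv(hparams: dict) -> int:
    # Falcon-40B provides "n_head_kv"; Falcon-7B omits the key, which now
    # maps to a single K/V head (multi-query attention) instead of leaving
    # the GGUF field unset.
    return hparams["n_head_kv"] if "n_head_kv" in hparams else 1

assert head_count_kv({"n_head": 128, "n_head_kv": 8}) == 8  # 40B-style config
assert head_count_kv({"n_head": 71}) == 1                   # 7B-style config
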
@@ -201,7 +204,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 
 # params for qkv transform
 n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else n_head
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
 
 head_dim = hparams["hidden_size"] // n_head
 
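
The same default matters for the QKV transform below: head_dim = hidden_size // n_head, and the fused query_key_value projection packs n_head query heads plus n_head_kv key and value heads. A rough sketch of the resulting widths, assuming the published Falcon hyperparameters (hidden_size 4544, n_head 71 for 7B; hidden_size 8192, n_head 128, n_head_kv 8 for 40B):

# Sketch only: shows why the fallback must be 1 (multi-query), not n_head.
def qkv_width(hidden_size: int, n_head: int, n_head_kv: int) -> int:
    head_dim = hidden_size // n_head
    # fused QKV: n_head query heads + n_head_kv key heads + n_head_kv value heads
    return (n_head + 2 * n_head_kv) * head_dim

print(qkv_width(4544, 71, 1))   # Falcon-7B:  4672 = 4544 + 2*64
print(qkv_width(8192, 128, 8))  # Falcon-40B: 9216 = 8192 + 2*8*64

This is the same n_embd + 2*n_embd_gqa shape that llama.cpp expects for the wqkv tensor in the hunk further down.
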
llama.cpp (40 changed lines)
@@ -1859,10 +1859,13 @@ static void llm_load_tensors(
         for (uint32_t i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];
 
             layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, GGML_BACKEND_CPU);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
-            layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
-            layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+
+            if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+                layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, GGML_BACKEND_CPU);
+                layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, GGML_BACKEND_CPU);
+            }
 
             layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, GGML_BACKEND_CPU);
             layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, GGML_BACKEND_CPU);
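
The gguf_find_tensor guard is what makes the second attention norm optional: Falcon-40B GGUF files contain a per-layer attn_norm_2 weight and bias, Falcon-7B files do not, so the loader should only create those tensors when they exist in the file. The same idea in a small Python sketch, with a plain dict standing in for the GGUF tensor index (the "blk.N.attn_norm_2.*" names are an assumption for illustration, not necessarily the exact names the converter emits):

# Sketch only: `tensors` stands in for the tensor index of a loaded GGUF file.
def load_layer_norms(tensors: dict, i: int) -> dict:
    layer = {
        "attn_norm":   tensors[f"blk.{i}.attn_norm.weight"],
        "attn_norm_b": tensors[f"blk.{i}.attn_norm.bias"],
    }
    # Mirror of the gguf_find_tensor(...) >= 0 check: only 40B-style files
    # carry a second per-layer norm, so load it only when present.
    if f"blk.{i}.attn_norm_2.weight" in tensors:
        layer["attn_norm_2"]   = tensors[f"blk.{i}.attn_norm_2.weight"]
        layer["attn_norm_2_b"] = tensors[f"blk.{i}.attn_norm_2.bias"]
    return layer
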
@@ -2421,19 +2424,19 @@ static struct ggml_cgraph * llm_build_falcon(
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * cur;
-        struct ggml_tensor * layernorm_output;
+        struct ggml_tensor * attn_norm;
 
         // self-attention
         {
-            layernorm_output = ggml_norm(ctx0, inpL);
+            attn_norm = ggml_norm(ctx0, inpL);
 
-            layernorm_output = ggml_add(ctx0,
+            attn_norm = ggml_add(ctx0,
                     ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attn_norm, layernorm_output),
-                        layernorm_output),
-                    ggml_repeat(ctx0, model.layers[il].attn_norm_b, layernorm_output));
+                        ggml_repeat(ctx0, model.layers[il].attn_norm, attn_norm),
+                        attn_norm),
+                    ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm));
 
-            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+            if (hparams.n_head_kv == 8) { // Falcon-40B
                 cur = ggml_norm(ctx0, inpL);
 
                 cur = ggml_add(ctx0,
@@ -2441,9 +2444,8 @@ static struct ggml_cgraph * llm_build_falcon(
                         ggml_repeat(ctx0, model.layers[il].attn_norm_2, cur),
                         cur),
                     ggml_repeat(ctx0, model.layers[il].attn_norm_2_b, cur));
-            }
-            else { // Falcon 7B
-                cur = layernorm_output;
+            } else { // Falcon 7B
+                cur = attn_norm;
             }
 
             // compute QKV
@@ -2563,8 +2565,8 @@ static struct ggml_cgraph * llm_build_falcon(
             }
         }
 
-        struct ggml_tensor* inpFF = layernorm_output;
-        struct ggml_tensor* attn_out = ggml_cpy(
+        struct ggml_tensor * inpFF = attn_norm;
+        struct ggml_tensor * attn_out = ggml_cpy(
             ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
 
         {
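
inpFF keeps the norm output around because Falcon is a parallel block: the MLP runs on the layernormed input rather than on the attention result, and both branch outputs are added back onto the residual stream. A structural sketch (hypothetical attn/mlp callables, not the ggml graph code):

# Sketch only: illustrates the parallel attention/MLP structure that inpFF serves.
def falcon_parallel_block(x, attn, mlp, norm_for_attn, norm_for_mlp):
    attn_out = attn(norm_for_attn(x))
    mlp_out  = mlp(norm_for_mlp(x))   # for 7B both norms are the same tensor
    return x + attn_out + mlp_out     # parallel residual
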
@@ -2607,7 +2609,7 @@ static struct ggml_cgraph * llama_build_graph(
         const float * embd,
         int   n_tokens,
         int   n_past) {
     const auto & model = lctx.model;
 
     struct ggml_cgraph * result = NULL;
 
@@ -2669,8 +2671,8 @@ static bool llama_eval_internal(
 
     GGML_ASSERT(!!kv_self.ctx);
 
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
     ggml_allocr_reset(lctx.alloc);
 