llama : de-shadow (cont) [no ci]

parent 0127774ae4
commit 32e7b9dc99

2 changed files with 34 additions and 30 deletions
@@ -24,25 +24,30 @@
 struct naive_trie {
     naive_trie() : has_value(false), value(0) {
     }
-    void insert(const char * key, size_t len, int32_t value = 0) {
+    void insert(const char * key, size_t len, int32_t val = 0) {
         if (len == 0) {
-            this->has_value = true;
-            this->value = value;
+            has_value = true;
+            value = val;
+
             return;
         }
+
         char c = key[0];
         auto res = children.find(c);
         if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, value);
+            res->second.insert(key + 1, len - 1, val);
         } else {
             auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, value);
+            res.first->second.insert(key + 1, len - 1, val);
         }
     }
+
     std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
         if (len == 0 || offset == len) {
             return std::make_pair(key, offset);
         }
+
         char c = key[offset];
         auto res = children.find(c);
         if (res != children.end()) {
@@ -51,6 +56,7 @@ struct naive_trie {
+
         return std::make_pair(key, offset);
     }

     const struct naive_trie * traverse(const char c) const {
         auto res = children.find(c);
         if (res != children.end()) {
@@ -59,6 +65,7 @@ struct naive_trie {
+
         return NULL;
     }

     std::map<char, struct naive_trie> children;
     bool has_value;
     llama_token value;
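These first three hunks cover the first of the two changed files; the hunks below, whose line numbers jump to the thousands, cover the second. The pattern here: the `insert` parameter `value` shadowed the member `value`, which is why the original needed `this->` qualifiers; renaming the parameter to `val` removes the shadow and the qualifiers. A minimal sketch of the same before/after shape, using a hypothetical `counter` struct rather than the trie:

```cpp
#include <cstdint>

struct counter {
    // Before: a parameter also named "value" would shadow the member below,
    // forcing "this->value = value;" and tripping -Wshadow.
    // After: naming the parameter "val" leaves the member name unambiguous.
    void set(int32_t val) {
        value = val;
    }

    int32_t value = 0;
};
```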
@@ -1656,10 +1656,10 @@ struct llm_build_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_i = hparams.n_head(il);

-            if (n_head == 0) {
+            if (n_head_i == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;
             } else {
@@ -1670,11 +1670,11 @@ struct llm_build_context {
                 cb(cur, "attn_norm", il);
             }

-            if (n_head > 0 && n_head_kv == 0) {
+            if (n_head_i > 0 && n_head_kv_i == 0) {
                 // "linear attention" of Llama-3_1-Nemotron-51B
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "wo", il);
-            } else if (n_head > 0) {
+            } else if (n_head_i > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
                 struct ggml_tensor * rope_factors = build_rope_factors(il);
@@ -1702,14 +1702,14 @@ struct llm_build_context {
                 }

                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -1734,7 +1734,7 @@ struct llm_build_context {

             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             struct ggml_tensor * ffn_inp = cur;
-            if (n_head > 0) {
+            if (n_head_i > 0) {
                 ffn_inp = ggml_add(ctx0, cur, inpSA);
                 cb(ffn_inp, "ffn_inp", il);
             }
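These four hunks are one rename applied consistently: inside the layer loop, the per-layer head counts were read into locals named `n_head` and `n_head_kv`, shadowing the like-named values already in scope in `llm_build_context`; the `_i` suffix keeps both visible at once. A reduced sketch of the shape of the fix, with a hypothetical `hparams_t` standing in for the real builder context:

```cpp
#include <cstdint>

// Hypothetical stand-in for the per-layer hparams accessor used above.
struct hparams_t {
    int64_t n_head_max = 32;
    int64_t n_head(int il) const { return il == 0 ? 0 : n_head_max; }
};

void build_layers(const hparams_t & hparams, int n_layer) {
    const int64_t n_head = hparams.n_head_max; // graph-wide value, stays in scope

    for (int il = 0; il < n_layer; ++il) {
        // Before: "const int64_t n_head = hparams.n_head(il);" shadowed the
        // outer n_head for the rest of the loop body.
        const int64_t n_head_i = hparams.n_head(il); // per-layer value

        if (n_head_i == 0) {
            // attention-free layer: handled specially, as in the diff above
        }
    }

    (void)n_head; // silence -Wunused-variable in this sketch
}
```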
@@ -2643,7 +2643,7 @@ struct llm_build_context {

         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
+            cur = inpL;

             struct ggml_tensor * Qcur;
             struct ggml_tensor * Kcur;
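Here the fix goes the other way: instead of renaming, the loop body drops its own `struct ggml_tensor * cur` declaration and reassigns the `cur` that already exists in the enclosing scope. A minimal sketch of that shape (hypothetical function, not the real graph code):

```cpp
// Hypothetical reduction of the "cur = inpL;" change above.
void walk(const float * inpL, int n_layer) {
    const float * cur = nullptr; // declared once, outside the loop

    for (int il = 0; il < n_layer; ++il) {
        // Before: "const float * cur = inpL;" created an inner cur that
        // shadowed this one. After: the outer cur is reassigned instead.
        cur = inpL;
    }

    (void)cur;
}
```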
@@ -4717,8 +4717,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4825,8 +4823,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;

@@ -4962,6 +4958,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

         const int64_t n_embd_head = hparams.n_embd_head_v;

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
+
@@ -5800,9 +5797,9 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head = hparams.n_head(il);
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+            const int64_t n_head_i = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i;

             cur = inpL;
             struct ggml_tensor * residual = cur;
@@ -5818,15 +5815,15 @@ struct llm_build_context {
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);

-            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens);

-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0));
             cb(Qcur, "Qcur", il);

-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i));
             cb(Kcur, "Kcur", il);

-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i)));
             cb(Vcur, "Vcur", il);

             Qcur = llm_build_norm(ctx0, Qcur, hparams,
@@ -5851,7 +5848,7 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);

-            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens);
             cb(Qcur, "Vcur", il);

             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
@@ -7497,7 +7494,7 @@ struct llm_build_context {

         const int64_t n_seqs = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
@@ -7610,7 +7607,7 @@ struct llm_build_context {

         const int64_t n_seqs = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
-        const int64_t n_tokens = ubatch.n_tokens;
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
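The last two hunks drop a local `n_tokens` that merely copied `ubatch.n_tokens` into a name the surrounding context already defines; the asserts that follow keep working against the outer value. All of the edits in this commit are renames or deletions with no runtime effect; the shadows they remove are the kind `-Wshadow` reports, which both GCC and Clang support. A hedged sketch of surfacing such a warning (file name and values are placeholders, not from the repository):

```cpp
// shadow_demo.cpp
// Build sketch: g++ -std=c++11 -Wshadow -c shadow_demo.cpp
// Expect a warning along the lines of:
//   "declaration of 'n' shadows a parameter [-Wshadow]"
int sum(int n) {
    int total = 0;
    for (int i = 0; i < n; ++i) {
        int n = i * 2; // shadows the parameter n; -Wshadow reports this
        total += n;
    }
    return total;
}
```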