llama : de-shadow (wip) [no ci]

This commit is contained in:
Georgi Gerganov 2025-01-12 12:15:19 +02:00
parent 168324a388
commit 0bebe45a25
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 31 additions and 29 deletions

View file

@ -204,13 +204,15 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
__func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
// print first 10 elements // print first 10 elements
const float * data = (const float *) cur->data; {
const float * data = (const float *) cur->data;
printf("%s data[:10] : ", name); printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]); printf("%f ", data[j]);
}
printf("\n\n");
} }
printf("\n\n");
// check data // check data
if (check_data) { if (check_data) {

View file

@ -58,12 +58,12 @@ struct llama_kv_cache {
std::vector<ggml_backend_buffer_ptr> bufs; std::vector<ggml_backend_buffer_ptr> bufs;
size_t total_size() const { size_t total_size() const {
size_t size = 0; size_t size_all = 0;
for (const auto & buf : bufs) { for (const auto & buf : bufs) {
size += ggml_backend_buffer_get_size(buf.get()); size_all += ggml_backend_buffer_get_size(buf.get());
} }
return size; return size_all;
} }
// TODO: better data structures to reduce the cost of this operation // TODO: better data structures to reduce the cost of this operation

View file

@ -1174,14 +1174,15 @@ struct llm_build_context {
ggml_set_input(lctx.inp_K_shift); ggml_set_input(lctx.inp_K_shift);
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_head_kv_i = hparams.n_head_kv(il);
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * rope_factors = build_rope_factors(il);
struct ggml_tensor * k = struct ggml_tensor * k =
ggml_view_3d(ctx0, kv_self.k_l[il], ggml_view_3d(ctx0, kv_self.k_l[il],
n_embd_head_k, n_head_kv, n_ctx, n_embd_head_k, n_head_kv_i, n_ctx,
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
0); 0);
struct ggml_tensor * tmp; struct ggml_tensor * tmp;
@ -1231,18 +1232,18 @@ struct llm_build_context {
} }
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il);
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm, n_embd_k_gqa_i, nm,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i));
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
n_embd_k_gqa, nm, n_embd_k_gqa_i, nm,
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id));
ggml_tensor * view_v_src; ggml_tensor * view_v_src;
ggml_tensor * view_v_dst; ggml_tensor * view_v_dst;
@ -1250,22 +1251,22 @@ struct llm_build_context {
if (flash_attn) { if (flash_attn) {
// NOTE: the V cache is not transposed when using flash attention // NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
n_embd_v_gqa, nm, n_embd_v_gqa_i, nm,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
n_embd_v_gqa, nm, n_embd_v_gqa_i, nm,
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id));
} else { } else {
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
nm, n_embd_v_gqa, nm, n_embd_v_gqa_i,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size), ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
ggml_row_size(kv_self.v_l[il]->type, i)); ggml_row_size(kv_self.v_l[il]->type, i));
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
nm, n_embd_v_gqa, nm, n_embd_v_gqa_i,
ggml_row_size(kv_self.v_l[il]->type, kv_self.size), ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
ggml_row_size(kv_self.v_l[il]->type, id)); ggml_row_size(kv_self.v_l[il]->type, id));
} }
@ -1459,7 +1460,6 @@ struct llm_build_context {
} }
struct ggml_tensor * llm_build_inp_embd_enc() { struct ggml_tensor * llm_build_inp_embd_enc() {
const int64_t n_embd = hparams.n_embd;
lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
ggml_set_input(lctx.inp_embd_enc); ggml_set_input(lctx.inp_embd_enc);
cb(lctx.inp_embd_enc, "embd_enc", -1); cb(lctx.inp_embd_enc, "embd_enc", -1);