llama : de-shadow (wip) [no ci]
commit 0bebe45a25
parent 168324a388
3 changed files with 31 additions and 29 deletions
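The renames in the hunks below (size → size_all, n_head_kv → n_head_kv_i, n_embd_k_gqa → n_embd_k_gqa_i, and so on) all serve the same de-shadowing goal: a local declared in a loop or nested scope either gets a distinct name, or its scope is narrowed with an extra { ... } block (as in gguf_ex_read_1), so it no longer hides an outer variable of the same name and the code stays clean under -Wshadow. A minimal stand-alone sketch of the rename pattern, using hypothetical values rather than code from this commit:

    // Hypothetical example of the de-shadowing rename pattern; not code from this commit.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd_k_gqa = 128;                     // outer, model-wide value

        for (int il = 0; il < 4; ++il) {
            // Redeclaring `n_embd_k_gqa` here would shadow the outer variable and
            // warn under -Wshadow; the per-layer value gets an `_i` suffix instead.
            const int64_t n_embd_k_gqa_i = n_embd_k_gqa + il; // per-layer value
            printf("layer %d: %lld\n", il, (long long) n_embd_k_gqa_i);
        }
        return 0;
    }

Compiled with g++ -Wshadow -Werror, the renamed form builds cleanly, while the shadowing form would not.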
@@ -204,6 +204,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
                __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);

         // print first 10 elements
+        {
         const float * data = (const float *) cur->data;

         printf("%s data[:10] : ", name);
@@ -211,6 +212,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
             printf("%f ", data[j]);
         }
         printf("\n\n");
+        }

         // check data
         if (check_data) {
@@ -58,12 +58,12 @@ struct llama_kv_cache {
     std::vector<ggml_backend_buffer_ptr> bufs;

     size_t total_size() const {
-        size_t size = 0;
+        size_t size_all = 0;
         for (const auto & buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf.get());
+            size_all += ggml_backend_buffer_get_size(buf.get());
         }

-        return size;
+        return size_all;
     }

     // TODO: better data structures to reduce the cost of this operation
@@ -1174,14 +1174,15 @@ struct llm_build_context {
         ggml_set_input(lctx.inp_K_shift);

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);

             struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * k =
                 ggml_view_3d(ctx0, kv_self.k_l[il],
-                    n_embd_head_k, n_head_kv, n_ctx,
+                    n_embd_head_k, n_head_kv_i, n_ctx,
                     ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
                     0);

             struct ggml_tensor * tmp;
@@ -1231,18 +1232,18 @@ struct llm_build_context {
         }

         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
+            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il);

             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                    n_embd_k_gqa_i, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i));

             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                    n_embd_k_gqa_i, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id));

             ggml_tensor * view_v_src;
             ggml_tensor * view_v_dst;
@@ -1250,22 +1251,22 @@ struct llm_build_context {
             if (flash_attn) {
                 // NOTE: the V cache is not transposed when using flash attention
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                        n_embd_v_gqa_i, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i));

                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                        n_embd_v_gqa_i, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id));
             } else {
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
+                        nm, n_embd_v_gqa_i,
                         ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                         ggml_row_size(kv_self.v_l[il]->type, i));

                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
+                        nm, n_embd_v_gqa_i,
                         ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                         ggml_row_size(kv_self.v_l[il]->type, id));
             }
@@ -1459,7 +1460,6 @@ struct llm_build_context {
     }

     struct ggml_tensor * llm_build_inp_embd_enc() {
-        const int64_t n_embd = hparams.n_embd;
         lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
         ggml_set_input(lctx.inp_embd_enc);
         cb(lctx.inp_embd_enc, "embd_enc", -1);