From 0c7e21d7b2de78f2ff17ed279a481d021c181ac0 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 29 Mar 2024 19:10:20 +0100
Subject: [PATCH] ggml : update mul_mat_id to use the same tensor for all the
 experts

---
 ggml.c    | 58 +++++++++++++++++++++++++-------------------------
 ggml.h    |  3 +--
 llama.cpp | 63 +++++++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 80 insertions(+), 44 deletions(-)

diff --git a/ggml.c b/ggml.c
index 7471e7926..e212d060b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4575,8 +4575,7 @@ void ggml_mul_mat_set_prec(
 
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
-        struct ggml_tensor  * const as[],
-        int                   n_as,
+        struct ggml_tensor  * as,
         struct ggml_tensor  * ids,
         int                   id,
         struct ggml_tensor  * b) {
@@ -4585,33 +4584,34 @@ struct ggml_tensor * ggml_mul_mat_id(
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
     GGML_ASSERT(ids->ne[1] == b->ne[1]);
     GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    //GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
     GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+    // TODO: restore checks
 
     bool is_node = false;
 
-    if (as[0]->grad || b->grad) {
+    if (as->grad || b->grad) {
         is_node = true;
     }
 
-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);
 
     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
+    result->src[0] = as;
     result->src[1] = b;
+    result->src[2] = ids;
 
-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
+    //for (int i = 0; i < n_as; i++) {
+    //    struct ggml_tensor * a = as[i];
+    //    GGML_ASSERT(ggml_are_same_shape(as[0], a));
+    //    GGML_ASSERT(ggml_can_mul_mat(a, b));
+    //    GGML_ASSERT(!ggml_is_transposed(a));
+    //    result->src[i + 2] = a;
+    //}
 
     return result;
 }
@@ -10948,10 +10948,9 @@ static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
-    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+    const struct ggml_tensor * ids = dst->src[2];
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -10982,12 +10981,15 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb2 <= nb3);
 
     // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    //const int64_t r2 = ne12/ne02;
+    //const int64_t r3 = ne13/ne03;
+    // broadcast is not supported with mmid
+    assert(ne12 == 1);
+    assert(ne13 == 1);
 
     // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
+    const int n_as = src0->ne[2]; //ggml_get_op_params_i32(dst, 1);
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
@@ -11047,7 +11049,8 @@ static void ggml_compute_forward_mul_mat_id(
            continue;
        }
 
-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        //const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        size_t src0_offset = src0->nb[2]*cur_a;
 
        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11082,9 +11085,6 @@ static void ggml_compute_forward_mul_mat_id(
                continue;
            }
 
-            assert(ne12 % ne02 == 0);
-            assert(ne13 % ne03 == 0);
-
            // block-tiling attempt
            const int64_t blck_0 = 16;
            const int64_t blck_1 = 16;
@@ -11101,14 +11101,14 @@ static void ggml_compute_forward_mul_mat_id(
                    const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
 
                    // broadcast src0 into src1
-                    const int64_t i03 = i13/r3;
-                    const int64_t i02 = i12/r2;
+                    //const int64_t i03 = i13/r3;
+                    //const int64_t i02 = i12/r2;
 
                    const int64_t i1 = i11;
                    const int64_t i2 = i12;
                    const int64_t i3 = i13;
 
-                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+                    const char * src0_row = (const char *) src0->data + src0_offset;
 
                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -18464,13 +18464,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
            case GGML_OP_MUL_MAT_ID:
                {
                    cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src0 = node->src[0];
                    const struct ggml_tensor * src1 = node->src[1];
                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                    if (src1->type != vec_dot_type) {
                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                    }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    const int n_as = src0->ne[2];
                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
diff --git a/ggml.h b/ggml.h
index 5d4a4ceb6..5cef45c0b 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1164,8 +1164,7 @@ extern "C" {
    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
+            struct ggml_tensor  * as,
            struct ggml_tensor  * ids,
            int                   id,
            struct ggml_tensor  * b);
diff --git a/llama.cpp b/llama.cpp
index 21e7a067a..6abf9f354 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1864,6 +1864,10 @@ struct llama_layer {
 
    // ff MoE
    struct ggml_tensor * ffn_gate_inp;
+    struct ggml_tensor * ffn_gate_exps;//[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_down_exps;//[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_exps ;//[LLAMA_MAX_EXPERTS];
+
    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
    struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
@@ -3170,7 +3174,7 @@ struct llama_model_loader {
                return weight;
            }
        }
-        throw std::runtime_error(format("tensor %s not found", name));
+        throw std::runtime_error(format("tensor '%s' not found", name));
    }
 
    struct ggml_tensor * get_tensor_meta(const char * name) const {
@@ -3260,6 +3264,10 @@ struct llama_model_loader {
        *last  = 0;
        *addr = mapping->addr;
        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+            // hack to skip moe merged tensor
+            if (strlen(ggml_get_name(tensor)) == 0) {
+                continue;
+            }
            const auto & w = get_weights(ggml_get_name(tensor));
            if (w.idx != idx) {
                continue;
@@ -3304,6 +3312,11 @@ struct llama_model_loader {
        std::vector<no_init<uint8_t>> read_buf;
 
        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            // hack to skip moe merged tensor
+            if (strlen(ggml_get_name(cur)) == 0) {
+                continue;
+            }
+
            if (progress_callback) {
                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                    return false;
@@ -4358,6 +4371,10 @@ static bool llm_load_tensors(
 
    // create one context per buffer type
    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
+
+    // hack for moe merged tensors
+    ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    for (auto & it : buft_layer_count) {
        struct ggml_init_params params = {
@@ -4451,11 +4468,30 @@ static bool llm_load_tensors(
                        GGML_ASSERT(hparams.n_expert      > 0);
                        GGML_ASSERT(hparams.n_expert_used > 0);
 
+                        // hack to merge tensors, need to clean this up
+                        // merged tensors
+                        ggml_type type = ml.get_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                        layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type, n_embd,   n_ff, hparams.n_expert);
+                        layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type,   n_ff, n_embd, hparams.n_expert);
+                        layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type, n_embd,   n_ff, hparams.n_expert);
+
                        // MoE branch
                        for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                            layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff});
-                            layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
-                            layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
+                            // hack
+                            // individual tensors as views
+                            layer.ffn_gate_exp[x] = ggml_view_2d(ctx_split, layer.ffn_gate_exps, n_embd, n_ff, layer.ffn_gate_exps->nb[1], layer.ffn_gate_exps->nb[2]*x);
+                            layer.ffn_down_exp[x] = ggml_view_2d(ctx_split, layer.ffn_down_exps, n_ff, n_embd, layer.ffn_down_exps->nb[1], layer.ffn_down_exps->nb[2]*x);
+                            layer.ffn_up_exp[x]   = ggml_view_2d(ctx_split, layer.ffn_up_exps,   n_embd, n_ff, layer.ffn_up_exps->nb[1],   layer.ffn_up_exps->nb[2]*x);
+
+                            ggml_set_name(layer.ffn_gate_exp[x], tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x).c_str());
+                            ggml_set_name(layer.ffn_down_exp[x], tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x).c_str());
+                            ggml_set_name(layer.ffn_up_exp[x],   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x).c_str());
+
+                            ml.n_created += 3; // hack
+
+                            //layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff});
+                            //layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
+                            //layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
                        }
                    }
                }
@@ -4500,9 +4536,10 @@ static bool llm_load_tensors(
 
                        // MoE branch
                        for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                            layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff});
-                            layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
-                            layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
+                            GGML_ASSERT(!"not implemented");
+                            //layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd,   n_ff});
+                            //layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
+                            //layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
                        }
 
                        layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -6284,10 +6321,10 @@ struct llm_build_context {
                for (int i = 0; i < n_expert_used; ++i) {
                    ggml_tensor * cur_expert;
 
-                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
                    cb(cur_up, "ffn_moe_up", il);
 
-                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
                    cb(cur_gate, "ffn_moe_gate", il);
 
                    cur_gate = ggml_silu(ctx0, cur_gate);
@@ -6296,7 +6333,7 @@ struct llm_build_context {
                    cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
                    cb(cur_expert, "ffn_moe_gate_par", il);
 
-                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
                    cb(cur_expert, "ffn_moe_down", il);
 
                    cur_expert = ggml_mul(ctx0, cur_expert,
@@ -6818,10 +6855,10 @@ struct llm_build_context {
                for (int i = 0; i < n_expert_used; ++i) {
                    ggml_tensor * cur_expert;
 
-                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
                    cb(cur_up, "ffn_moe_up", il);
 
-                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
                    cb(cur_gate, "ffn_moe_gate", il);
 
                    //GeLU
@@ -6831,7 +6868,7 @@ struct llm_build_context {
                    cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
                    cb(cur_expert, "ffn_moe_gate_par", il);
 
-                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
                    cb(cur_expert, "ffn_moe_down", il);
 
                    cur_expert = ggml_mul(ctx0, cur_expert,