ggml : update mul_mat_id to use the same tensor for all the experts

slaren 2024-03-29 19:10:20 +01:00
parent ba0c7c70ab
commit 0c7e21d7b2
3 changed files with 80 additions and 44 deletions

ggml.c (58 changed lines)

@@ -4575,8 +4575,7 @@ void ggml_mul_mat_set_prec(
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
-        struct ggml_tensor  * const as[],
-        int                   n_as,
+        struct ggml_tensor  * as,
         struct ggml_tensor  * ids,
         int                   id,
         struct ggml_tensor  * b) {
@@ -4585,33 +4584,34 @@ struct ggml_tensor * ggml_mul_mat_id(
     GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
     GGML_ASSERT(ids->ne[1] == b->ne[1]);
     GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    //GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
     GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+    // TODO: restore checks

     bool is_node = false;

-    if (as[0]->grad || b->grad) {
+    if (as->grad || b->grad) {
         is_node = true;
     }

-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

     ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);

     result->op   = GGML_OP_MUL_MAT_ID;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
+    result->src[0] = as;
     result->src[1] = b;
+    result->src[2] = ids;

-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
+    //for (int i = 0; i < n_as; i++) {
+    //    struct ggml_tensor * a = as[i];
+    //    GGML_ASSERT(ggml_are_same_shape(as[0], a));
+    //    GGML_ASSERT(ggml_can_mul_mat(a, b));
+    //    GGML_ASSERT(!ggml_is_transposed(a));
+    //    result->src[i + 2] = a;
+    //}

     return result;
 }
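
Note: with this change the operand layout of a GGML_OP_MUL_MAT_ID node is src[0] = merged expert tensor, src[1] = activations, src[2] = row ids, and the expert count comes from ne[2] instead of an op param. A minimal sketch of how graph-walking code might read the node back (the variable name node is illustrative):

    // Sketch, assuming the post-commit source layout of GGML_OP_MUL_MAT_ID.
    const struct ggml_tensor * as  = node->src[0]; // merged experts, one 2D matrix per slice of ne[2]
    const struct ggml_tensor * b   = node->src[1]; // input activations
    const struct ggml_tensor * ids = node->src[2]; // I32 tensor of selected expert ids
    const int n_as = as->ne[2];                        // number of experts (was op param 1 before)
    const int id   = ggml_get_op_params_i32(node, 0);  // which used-expert slot this node computes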
@@ -10948,10 +10948,9 @@ static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {

-    const struct ggml_tensor * ids  = dst->src[0];
+    const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+    const struct ggml_tensor * ids  = dst->src[2];

     GGML_TENSOR_BINARY_OP_LOCALS
@@ -10982,12 +10981,15 @@ static void ggml_compute_forward_mul_mat_id(
     GGML_ASSERT(nb2 <= nb3);

     // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    //const int64_t r2 = ne12/ne02;
+    //const int64_t r3 = ne13/ne03;
+
+    // broadcast is not supported with mmid
+    assert(ne12 == 1);
+    assert(ne13 == 1);

     // row groups
     const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
+    const int n_as = src0->ne[2]; //ggml_get_op_params_i32(dst, 1);

     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
@@ -11047,7 +11049,8 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }

-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        //const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        size_t src0_offset = src0->nb[2]*cur_a;

         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11082,9 +11085,6 @@ static void ggml_compute_forward_mul_mat_id(
             continue;
         }

-        assert(ne12 % ne02 == 0);
-        assert(ne13 % ne03 == 0);
-
         // block-tiling attempt
         const int64_t blck_0 = 16;
         const int64_t blck_1 = 16;
@@ -11101,14 +11101,14 @@ static void ggml_compute_forward_mul_mat_id(
                 const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);

                 // broadcast src0 into src1
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
+                //const int64_t i03 = i13/r3;
+                //const int64_t i02 = i12/r2;

                 const int64_t i1 = i11;
                 const int64_t i2 = i12;
                 const int64_t i3 = i13;

-                const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+                const char * src0_row = (const char *) src0->data + src0_offset;

                 // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                 // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
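
The expert matrix is now selected by a byte offset into the merged tensor rather than by switching to a different src tensor. A short sketch of the stride arithmetic, assuming the merged tensor is contiguous (expert_row and ir0 are illustrative names):

    // For a contiguous merged tensor with ne = { ne00, ne01, n_as }:
    //   src0->nb[1] = bytes per row of one expert matrix
    //   src0->nb[2] = src0->nb[1] * ne01 = bytes per whole expert matrix
    const size_t src0_offset = src0->nb[2]*cur_a;                       // start of expert cur_a
    const char * src0_row    = (const char *) src0->data + src0_offset; // as in the hunk above
    const char * expert_row  = src0_row + ir0*src0->nb[1];              // row ir0 of that expert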
@@ -18464,13 +18464,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads
             case GGML_OP_MUL_MAT_ID:
                 {
                     cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                     }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    const int n_as = src0->ne[2];
                     cur += GGML_PAD(cur, sizeof(int64_t));       // align
                     cur += n_as * sizeof(int64_t);               // matrix_row_counts
                     cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows

ggml.h (3 changed lines)

@@ -1164,8 +1164,7 @@ extern "C" {
     // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
+            struct ggml_tensor  * as,
             struct ggml_tensor  * ids,
             int                   id,
             struct ggml_tensor  * b);
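
With the new signature, callers pass one 3D tensor whose third dimension indexes the experts, instead of an array of 2D matrices plus n_as. A minimal usage sketch against the post-commit API (all sizes and names below are made up for illustration):

    #include "ggml.h"

    // hypothetical sizes, for illustration only
    enum { N_EMBD = 8, N_FF = 16, N_EXPERT = 4, N_EXPERT_USED = 2, N_TOKENS = 3 };

    // cur is expected to be an [N_EMBD, N_TOKENS] activation tensor
    static struct ggml_tensor * moe_up_slot0(struct ggml_context * ctx, struct ggml_tensor * cur) {
        // before: ggml_mul_mat_id(ctx, as /* array of 2D tensors */, N_EXPERT, ids, 0, cur)
        // after:  one merged 3D tensor, experts stacked along ne[2]
        struct ggml_tensor * up_exps = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, N_EMBD, N_FF, N_EXPERT);

        // selected expert index per (used-expert slot, token); I32, and ne[1] must match cur->ne[1]
        struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N_EXPERT_USED, N_TOKENS);

        // contribution of used-expert slot 0 for every token
        return ggml_mul_mat_id(ctx, up_exps, ids, /*id=*/0, cur);
    }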

llama.cpp

@@ -1864,6 +1864,10 @@ struct llama_layer {
     // ff MoE
     struct ggml_tensor * ffn_gate_inp;
+    struct ggml_tensor * ffn_gate_exps;//[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_down_exps;//[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_exps ;//[LLAMA_MAX_EXPERTS];
+
     struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
@@ -3170,7 +3174,7 @@ struct llama_model_loader {
                 return weight;
             }
         }
-        throw std::runtime_error(format("tensor %s not found", name));
+        throw std::runtime_error(format("tensor '%s' not found", name));
     }

     struct ggml_tensor * get_tensor_meta(const char * name) const {
@@ -3260,6 +3264,10 @@ struct llama_model_loader {
         *last  = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+            // hack to skip moe merged tensor
+            if (strlen(ggml_get_name(tensor)) == 0) {
+                continue;
+            }
             const auto & w = get_weights(ggml_get_name(tensor));
             if (w.idx != idx) {
                 continue;
@@ -3304,6 +3312,11 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;

         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+            // hack to skip moe merged tensor
+            if (strlen(ggml_get_name(cur)) == 0) {
+                continue;
+            }
+
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                     return false;
@@ -4358,6 +4371,10 @@ static bool llm_load_tensors(
     // create one context per buffer type
     size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output

+    // hack for moe merged tensors
+    ctx_size += ggml_tensor_overhead()*hparams.n_expert*n_layer;
+
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     for (auto & it : buft_layer_count) {
         struct ggml_init_params params = {
@@ -4451,11 +4468,30 @@ static bool llm_load_tensors(
                     GGML_ASSERT(hparams.n_expert      > 0);
                     GGML_ASSERT(hparams.n_expert_used > 0);

+                    // hack to merge tensors, need to clean this up
+                    // merged tensors
+                    ggml_type type = ml.get_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
+                    layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type, n_embd, n_ff, hparams.n_expert);
+                    layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type,   n_ff, n_embd, hparams.n_expert);
+                    layer.ffn_up_exps   = ggml_new_tensor_3d(ctx_split, type, n_embd, n_ff, hparams.n_expert);
+
                     // MoE branch
                     for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                        layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
-                        layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
-                        layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff});
+                        // hack
+                        // individual tensors as views
+                        layer.ffn_gate_exp[x] = ggml_view_2d(ctx_split, layer.ffn_gate_exps, n_embd, n_ff, layer.ffn_gate_exps->nb[1], layer.ffn_gate_exps->nb[2]*x);
+                        layer.ffn_down_exp[x] = ggml_view_2d(ctx_split, layer.ffn_down_exps, n_ff, n_embd, layer.ffn_down_exps->nb[1], layer.ffn_down_exps->nb[2]*x);
+                        layer.ffn_up_exp[x]   = ggml_view_2d(ctx_split, layer.ffn_up_exps,   n_embd, n_ff, layer.ffn_up_exps->nb[1],   layer.ffn_up_exps->nb[2]*x);
+
+                        ggml_set_name(layer.ffn_gate_exp[x], tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x).c_str());
+                        ggml_set_name(layer.ffn_down_exp[x], tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x).c_str());
+                        ggml_set_name(layer.ffn_up_exp[x],   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x).c_str());
+
+                        ml.n_created += 3; // hack
+
+                        //layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
+                        //layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
+                        //layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff});
                     }
                 }
             }
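
The per-expert ffn_*_exp[x] tensors are kept as 2D views into the merged 3D tensors so the rest of the loader still sees individually named weights. For any contiguous merged tensor, the slice for expert x can be exposed roughly like this (a sketch, not the exact loader code):

    // Sketch: slice x of a merged [ne0, ne1, n_expert] tensor as a 2D view.
    static struct ggml_tensor * expert_view(struct ggml_context * ctx, struct ggml_tensor * exps, int x) {
        return ggml_view_2d(ctx, exps,
                exps->ne[0], exps->ne[1], // same 2D shape as one expert matrix
                exps->nb[1],              // row stride unchanged
                exps->nb[2]*x);           // byte offset of expert x in the merged data
    }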
@@ -4500,9 +4536,10 @@ static bool llm_load_tensors(
                     // MoE branch
                     for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                        layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
-                        layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
-                        layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff});
+                        GGML_ASSERT(!"not implemented");
+                        //layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
+                        //layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
+                        //layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff});
                     }

                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -6284,10 +6321,10 @@ struct llm_build_context {
                 for (int i = 0; i < n_expert_used; ++i) {
                     ggml_tensor * cur_expert;

-                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
                     cb(cur_up, "ffn_moe_up", il);

-                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
                     cb(cur_gate, "ffn_moe_gate", il);

                     cur_gate = ggml_silu(ctx0, cur_gate);
@@ -6296,7 +6333,7 @@ struct llm_build_context {
                     cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
                     cb(cur_expert, "ffn_moe_gate_par", il);

-                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
                     cb(cur_expert, "ffn_moe_down", il);

                     cur_expert = ggml_mul(ctx0, cur_expert,
@@ -6818,10 +6855,10 @@ struct llm_build_context {
                 for (int i = 0; i < n_expert_used; ++i) {
                     ggml_tensor * cur_expert;

-                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
                     cb(cur_up, "ffn_moe_up", il);

-                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
+                    ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
                     cb(cur_gate, "ffn_moe_gate", il);

                     //GeLU
@@ -6831,7 +6868,7 @@ struct llm_build_context {
                     cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
                     cb(cur_expert, "ffn_moe_gate_par", il);

-                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
+                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
                     cb(cur_expert, "ffn_moe_down", il);

                     cur_expert = ggml_mul(ctx0, cur_expert,