phi3 : duplicate rope factors in each layer

phi3 : set phi-3 model type as 14B

model loader : simplify the process for duplicating model tensors

llama-bench : remove default pg test
slaren 2024-05-21 23:08:51 +02:00
parent 201cc11afa
commit 477973d2e1
2 changed files with 35 additions and 52 deletions

examples/llama-bench/llama-bench.cpp

@@ -195,7 +195,7 @@ static const cmd_params cmd_params_defaults = {
     /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt      */ {512},
     /* n_gen         */ {128},
-    /* n_pg          */ {{512, 128}},
+    /* n_pg          */ {},
     /* n_batch       */ {2048},
     /* n_ubatch      */ {512},
     /* type_k        */ {GGML_TYPE_F16},
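With the default pair removed, llama-bench no longer runs a combined prompt+generation test unless one is requested explicitly; the existing -pg option (e.g. llama-bench -pg 512,128) should still select it on demand.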

llama.cpp

@@ -1940,6 +1940,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long  = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };

 struct llama_kv_cell {
@@ -2111,10 +2115,6 @@ struct llama_model {
     struct ggml_tensor * output;
     struct ggml_tensor * output_b;

-    // long rope factors
-    struct ggml_tensor * rope_long  = nullptr;
-    struct ggml_tensor * rope_short = nullptr;
-
     std::vector<llama_layer> layers;

     llama_split_mode split_mode;
@@ -3425,11 +3425,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }

-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));

-        n_created++;
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }

         return tensor;
     }
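The duplicated flag centralizes the manual ml.n_created--; ml.size_data += ggml_nbytes(...) bookkeeping that was repeated at every call site below. A minimal standalone sketch (toy struct, not the llama.cpp API) of why a duplicated tensor adds to the byte count but not to the created count:

    #include <cstddef>
    #include <cstdio>

    // Toy stand-in for llama_model_loader's accounting. After loading, the
    // real loader verifies n_created == n_tensors (one ggml tensor per GGUF
    // entry), so a tensor that merely reuses another entry's data must not
    // bump n_created, while size_data must still grow so the extra copy is
    // included when buffers are sized and load progress is reported.
    struct toy_loader {
        int    n_tensors = 0; // entries in the model file
        int    n_created = 0; // tensors created one-to-one from those entries
        size_t size_data = 0; // bytes to load, pre-filled from the file entries

        void on_create(size_t nbytes, bool duplicated) {
            if (duplicated) {
                size_data += nbytes; // a second copy of data already counted once
            } else {
                n_created++;         // one-to-one with a file entry
            }
        }
    };

    int main() {
        toy_loader ml;
        ml.n_tensors = 1;
        ml.size_data = 4096;                      // from scanning the file
        ml.on_create(4096, /*duplicated=*/false); // tok_embd, backed by the entry
        ml.on_create(4096, /*duplicated=*/true);  // output, a copy of the same data
        std::printf("created %d/%d tensors, %zu bytes\n",
                    ml.n_created, ml.n_tensors, ml.size_data);
        return 0; // prints: created 1/1 tensors, 8192 bytes
    }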
@@ -3464,14 +3468,14 @@ struct llama_model_loader {
         return cur;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true, bool duplicated = false) {
         const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

         if (cur == NULL) {
             return NULL;
         }

-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, duplicated);
     }

     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -4136,6 +4140,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
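Phi-3-medium (the 14B variant) has 40 transformer layers, so a 40-layer Phi-3 model is now reported as MODEL_14B instead of unknown; 24 and 32 layers already covered the 1B and mini variants.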
@@ -4965,9 +4970,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
         }
@@ -5045,9 +5048,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
@@ -5174,9 +5175,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 if (!model.output) {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true); // needs to be on GPU
                 }
             }
@@ -5211,9 +5210,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 if (!model.output) {
                     // needs to be on GPU
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
@@ -5389,9 +5386,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 if (!model.output) {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true); // needs to be on GPU
                 }
             }
@@ -5511,9 +5506,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
@@ -5639,9 +5632,6 @@ static bool llm_load_tensors(
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });

-                    model.rope_long  = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head/2 }, false);
-                    model.rope_short = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, false);
-
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
@@ -5663,6 +5653,9 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, 2 * n_ff });
+
+                        layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head/2 }, false, i != 0);
+                        layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, false, i != 0);
                     }
                 } break;
             case LLM_ARCH_PLAMO:
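Two details worth noting in the hunk above: the name built by tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight") carries no layer index, so every layer resolves to the same single GGUF entry, and the i != 0 argument marks every copy after the first as duplicated so the loader's tensor count stays consistent. Creating the factors in ctx_layer rather than ctx_input places a copy in each layer's buffer, which is what lets the RoPE computation run on whichever backend a layer is offloaded to.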
@@ -5831,9 +5824,7 @@ static bool llm_load_tensors(
                 // output
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true); // same as tok_embd, duplicated to allow offloading

                 const int64_t n_ff          = hparams.n_ff;
                 const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5871,9 +5862,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
@@ -5927,9 +5916,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 // if output is NULL, init from the input tok embed, duplicated to allow offloading
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
@@ -5990,9 +5977,7 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     // init output from the input tok embed
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }

                 for (int i = 0; i < n_layer; ++i) {
@@ -6027,9 +6012,7 @@ static bool llm_load_tensors(
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true, true);
                 }
             }
@@ -6872,9 +6855,9 @@ struct llm_build_context {
         cb(lctx.inp_K_shift, "K_shift", -1);
         ggml_set_input(lctx.inp_K_shift);

-        struct ggml_tensor * rope_factors = build_rope_factors();
-
         for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
+
             struct ggml_tensor * tmp =
                 // we rotate only the first n_rot dimensions
                 ggml_rope_ext_inplace(ctx0,
@@ -6988,15 +6971,15 @@ struct llm_build_context {
         return lctx.inp_pos;
     }

-    struct ggml_tensor * build_rope_factors() {
+    struct ggml_tensor * build_rope_factors(int il) {
         // choose long/short freq factors based on the context size
         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

         if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
-            return model.rope_long;
+            return model.layers[il].rope_long;
         }

-        return model.rope_short;
+        return model.layers[il].rope_short;
     }

     struct ggml_tensor * build_inp_out_ids() {
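A standalone sketch of the selection rule build_rope_factors(il) implements (toy types and parameters; in llama.cpp the values come from cparams and hparams): the long factors apply only when the context available to each sequence exceeds the model's original training context.

    #include <cstdint>

    // Toy illustration of the long/short rope-factor choice above.
    struct toy_layer {
        const float * rope_long  = nullptr; // factors trained for extended context
        const float * rope_short = nullptr; // factors for the original context
    };

    const float * pick_rope_factors(const toy_layer & layer, uint32_t n_ctx,
                                    uint32_t n_seq_max, uint32_t n_orig_ctx) {
        const uint32_t n_ctx_per_seq = n_ctx / n_seq_max; // context per sequence

        if (n_ctx_per_seq > n_orig_ctx) {
            return layer.rope_long; // beyond the original training context
        }
        return layer.rope_short;
    }

For the Phi-3 128K variants the original training context (original_max_position_embeddings) is 4096 tokens, so any larger per-sequence context selects the long factors.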
@@ -9117,14 +9100,14 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        // rope freq factors for 128k context
-        struct ggml_tensor * rope_factors = build_rope_factors();
-
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;

             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
                     NULL,