phi3 : duplicate rope factors in each layer
phi3 : set phi-3 model type as 14B; model loader : simplify the process for duplicating model tensors; llama-bench : remove default pg test
This commit is contained in:
parent
201cc11afa
commit
477973d2e1
2 changed files with 35 additions and 52 deletions
|
@ -195,7 +195,7 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||||
/* n_prompt */ {512},
|
/* n_prompt */ {512},
|
||||||
/* n_gen */ {128},
|
/* n_gen */ {128},
|
||||||
/* n_pg */ {{512, 128}},
|
/* n_pg */ {},
|
||||||
/* n_batch */ {2048},
|
/* n_batch */ {2048},
|
||||||
/* n_ubatch */ {512},
|
/* n_ubatch */ {512},
|
||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
|
|
85
llama.cpp
85
llama.cpp
|
@ -1940,6 +1940,10 @@ struct llama_layer {
|
||||||
// mamba bias
|
// mamba bias
|
||||||
struct ggml_tensor * ssm_conv1d_b;
|
struct ggml_tensor * ssm_conv1d_b;
|
||||||
struct ggml_tensor * ssm_dt_b;
|
struct ggml_tensor * ssm_dt_b;
|
||||||
|
|
||||||
|
// long rope factors
|
||||||
|
struct ggml_tensor * rope_long = nullptr;
|
||||||
|
struct ggml_tensor * rope_short = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_kv_cell {
|
struct llama_kv_cell {
|
||||||
|
@ -2111,10 +2115,6 @@ struct llama_model {
|
||||||
struct ggml_tensor * output;
|
struct ggml_tensor * output;
|
||||||
struct ggml_tensor * output_b;
|
struct ggml_tensor * output_b;
|
||||||
|
|
||||||
// long rope factors
|
|
||||||
struct ggml_tensor * rope_long = nullptr;
|
|
||||||
struct ggml_tensor * rope_short = nullptr;
|
|
||||||
|
|
||||||
std::vector<llama_layer> layers;
|
std::vector<llama_layer> layers;
|
||||||
|
|
||||||
llama_split_mode split_mode;
|
llama_split_mode split_mode;
|
||||||
|
@ -3425,11 +3425,15 @@ struct llama_model_loader {
|
||||||
return get_tensor_meta(get_tensor_name(i));
|
return get_tensor_meta(get_tensor_name(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
|
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
|
||||||
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
||||||
ggml_set_name(tensor, ggml_get_name(cur));
|
ggml_set_name(tensor, ggml_get_name(cur));
|
||||||
|
|
||||||
n_created++;
|
if (duplicated) {
|
||||||
|
size_data += ggml_nbytes(cur);
|
||||||
|
} else {
|
||||||
|
n_created++;
|
||||||
|
}
|
||||||
|
|
||||||
return tensor;
|
return tensor;
|
||||||
}
|
}
|
||||||
|
@ -3464,14 +3468,14 @@ struct llama_model_loader {
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
|
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true, bool duplicated = false) {
|
||||||
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
||||||
|
|
||||||
if (cur == NULL) {
|
if (cur == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return create_tensor_for(ctx, cur);
|
return create_tensor_for(ctx, cur, duplicated);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
|
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
|
||||||
|
@ -4136,6 +4140,7 @@ static void llm_load_hparams(
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 24: model.type = e_model::MODEL_1B; break;
|
case 24: model.type = e_model::MODEL_1B; break;
|
||||||
case 32: model.type = e_model::MODEL_3B; break;
|
case 32: model.type = e_model::MODEL_3B; break;
|
||||||
|
case 40: model.type = e_model::MODEL_14B; break;
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@ -4965,9 +4970,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
// if output is NULL, init from the input tok embed
|
// if output is NULL, init from the input tok embed
|
||||||
if (model.output == NULL) {
|
if (model.output == NULL) {
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5045,9 +5048,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
// if output is NULL, init from the input tok embed
|
// if output is NULL, init from the input tok embed
|
||||||
if (model.output == NULL) {
|
if (model.output == NULL) {
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5174,9 +5175,7 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
if (!model.output) {
|
if (!model.output) {
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true); // needs to be on GPU
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5211,9 +5210,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
if (!model.output) {
|
if (!model.output) {
|
||||||
// needs to be on GPU
|
// needs to be on GPU
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5389,9 +5386,7 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
if (!model.output) {
|
if (!model.output) {
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true); // needs to be on GPU
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5511,9 +5506,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
// if output is NULL, init from the input tok embed
|
// if output is NULL, init from the input tok embed
|
||||||
if (model.output == NULL) {
|
if (model.output == NULL) {
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5639,9 +5632,6 @@ static bool llm_load_tensors(
|
||||||
{
|
{
|
||||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
|
||||||
|
|
||||||
model.rope_long = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, false);
|
|
||||||
model.rope_short = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, false);
|
|
||||||
|
|
||||||
// output
|
// output
|
||||||
{
|
{
|
||||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
|
||||||
|
@ -5663,6 +5653,9 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
||||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
||||||
|
|
||||||
|
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, false, i != 0);
|
||||||
|
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, false, i != 0);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_PLAMO:
|
case LLM_ARCH_PLAMO:
|
||||||
|
@ -5831,9 +5824,7 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
// output
|
// output
|
||||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true); // same as tok_embd, duplicated to allow offloading
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
|
|
||||||
const int64_t n_ff = hparams.n_ff;
|
const int64_t n_ff = hparams.n_ff;
|
||||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||||
|
@ -5871,9 +5862,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
// if output is NULL, init from the input tok embed
|
// if output is NULL, init from the input tok embed
|
||||||
if (model.output == NULL) {
|
if (model.output == NULL) {
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5927,9 +5916,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
||||||
if (model.output == NULL) {
|
if (model.output == NULL) {
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5990,9 +5977,7 @@ static bool llm_load_tensors(
|
||||||
{
|
{
|
||||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||||
// init output from the input tok embed
|
// init output from the input tok embed
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < n_layer; ++i) {
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
@ -6027,9 +6012,7 @@ static bool llm_load_tensors(
|
||||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||||
// if output is NULL, init from the input tok embed
|
// if output is NULL, init from the input tok embed
|
||||||
if (model.output == NULL) {
|
if (model.output == NULL) {
|
||||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, true);
|
||||||
ml.n_created--; // artificial tensor
|
|
||||||
ml.size_data += ggml_nbytes(model.output);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6872,9 +6855,9 @@ struct llm_build_context {
|
||||||
cb(lctx.inp_K_shift, "K_shift", -1);
|
cb(lctx.inp_K_shift, "K_shift", -1);
|
||||||
ggml_set_input(lctx.inp_K_shift);
|
ggml_set_input(lctx.inp_K_shift);
|
||||||
|
|
||||||
struct ggml_tensor * rope_factors = build_rope_factors();
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||||
struct ggml_tensor * tmp =
|
struct ggml_tensor * tmp =
|
||||||
// we rotate only the first n_rot dimensions
|
// we rotate only the first n_rot dimensions
|
||||||
ggml_rope_ext_inplace(ctx0,
|
ggml_rope_ext_inplace(ctx0,
|
||||||
|
@ -6988,15 +6971,15 @@ struct llm_build_context {
|
||||||
return lctx.inp_pos;
|
return lctx.inp_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * build_rope_factors() {
|
struct ggml_tensor * build_rope_factors(int il) {
|
||||||
// choose long/short freq factors based on the context size
|
// choose long/short freq factors based on the context size
|
||||||
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||||
|
|
||||||
if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
|
if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
|
||||||
return model.rope_long;
|
return model.layers[il].rope_long;
|
||||||
}
|
}
|
||||||
|
|
||||||
return model.rope_short;
|
return model.layers[il].rope_short;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * build_inp_out_ids() {
|
struct ggml_tensor * build_inp_out_ids() {
|
||||||
|
@ -9117,14 +9100,14 @@ struct llm_build_context {
|
||||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||||
|
|
||||||
// rope freq factors for 128k context
|
|
||||||
struct ggml_tensor * rope_factors = build_rope_factors();
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
auto residual = inpL;
|
auto residual = inpL;
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
|
// rope freq factors for 128k context
|
||||||
|
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||||
|
|
||||||
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
||||||
model.layers[il].attn_norm,
|
model.layers[il].attn_norm,
|
||||||
NULL,
|
NULL,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue