Rename to llm_build_ffn_mpt_awq

This commit is contained in:
Le Hoang Anh 2023-12-20 10:48:00 +07:00
parent d2e9d00cbc
commit 0610672b19
3 changed files with 36 additions and 68 deletions

View file

@ -1100,11 +1100,13 @@ void llama_batch_add(
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) { std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params); auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr); return std::make_tuple(nullptr, nullptr);
} }
auto cparams = llama_context_params_from_gpt_params(params); auto cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams); llama_context * lctx = llama_new_context_with_model(model, cparams);

100
llama.cpp
View file

@ -454,8 +454,8 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act"},
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
}, },
}, },
{ {
@ -1178,6 +1178,7 @@ struct llama_hparams {
float f_clamp_kqv; float f_clamp_kqv;
float f_max_alibi_bias; float f_max_alibi_bias;
bool use_awq; bool use_awq;
bool operator!=(const llama_hparams & other) const { bool operator!=(const llama_hparams & other) const {
@ -1274,7 +1275,7 @@ struct llama_layer {
// ff bias // ff bias
struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_up_b; // b3
struct ggml_tensor *ffn_act; struct ggml_tensor * ffn_act;
}; };
struct llama_kv_cell { struct llama_kv_cell {
@ -3423,10 +3424,10 @@ static void llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (model.hparams.use_awq) { if (model.hparams.use_awq) {
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend); layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
} }
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) { if (backend == GGML_BACKEND_GPU) {
if (model.hparams.use_awq) { if (model.hparams.use_awq) {
@ -3436,10 +3437,9 @@ static void llm_load_tensors(
ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm) +
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down) +
ggml_nbytes(layer.ffn_act) + ggml_nbytes(layer.ffn_up) +
ggml_nbytes(layer.ffn_up); ggml_nbytes(layer.ffn_act);
} } else {
else {
vram_weights += vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wqkv) +
@ -3647,7 +3647,8 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
model.hparams.vocab_only = params.vocab_only; model.hparams.vocab_only = params.vocab_only;
model.hparams.use_awq = params.use_awq; model.hparams.use_awq = params.use_awq;
llm_load_arch (ml, model); llm_load_arch (ml, model);
llm_load_hparams(ml, model); llm_load_hparams(ml, model);
llm_load_vocab (ml, model); llm_load_vocab (ml, model);
@ -3935,7 +3936,7 @@ static struct ggml_tensor * llm_build_ffn(
return cur; return cur;
} }
static struct ggml_tensor *llm_build_ffn( static struct ggml_tensor * llm_build_ffn_mpt_awq(
struct ggml_context *ctx, struct ggml_context *ctx,
struct ggml_tensor *cur, struct ggml_tensor *cur,
struct ggml_tensor *up, struct ggml_tensor *up,
@ -3950,72 +3951,39 @@ static struct ggml_tensor *llm_build_ffn(
const llm_build_cb &cb, const llm_build_cb &cb,
int il) int il)
{ {
struct ggml_tensor *tmp = ggml_mul_mat(ctx, up, cur); struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
cb(tmp, "ffn_up", il); cb(tmp, "ffn_up", il);
if (up_b) if (up_b) {
{
tmp = ggml_add(ctx, tmp, up_b); tmp = ggml_add(ctx, tmp, up_b);
cb(tmp, "ffn_up_b", il); cb(tmp, "ffn_up_b", il);
} }
if (gate) cur = tmp;
{
switch (type_gate)
{
case LLM_FFN_SEQ:
{
cur = ggml_mul_mat(ctx, gate, tmp);
cb(cur, "ffn_gate", il);
}
break;
case LLM_FFN_PAR:
{
cur = ggml_mul_mat(ctx, gate, cur);
cb(cur, "ffn_gate", il);
}
break;
}
if (gate_b) switch (type_op) {
{ case LLM_FFN_GELU_ACT:
cur = ggml_add(ctx, cur, gate_b); {
cb(cur, "ffn_gate_b", il); cur = ggml_gelu(ctx, cur);
} cb(cur, "ffn_relu", il);
} struct ggml_tensor *repeat = ggml_repeat(ctx, act_scales, cur);
else cb(repeat, "ffn_repeat(scales)", il);
{ cur = ggml_div(ctx, cur, repeat);
cur = tmp; cb(cur, "ffn_div(gelu)", il);
} break;
} }
switch (type_op) if (type_gate == LLM_FFN_PAR) {
{
case LLM_FFN_GELU_ACT:
{
cur = ggml_gelu(ctx, cur);
cb(cur, "ffn_relu", il);
struct ggml_tensor *repeat = ggml_repeat(ctx, act_scales, cur);
cb(repeat, "ffn_repeat(scales)", il);
cur = ggml_div(ctx, cur, repeat);
cb(cur, "ffn_div(gelu)", il);
}
break;
}
if (type_gate == LLM_FFN_PAR)
{
cur = ggml_mul(ctx, cur, tmp); cur = ggml_mul(ctx, cur, tmp);
cb(cur, "ffn_gate_par", il); cb(cur, "ffn_gate_par", il);
} }
cur = ggml_mul_mat(ctx, down, cur); cur = ggml_mul_mat(ctx, down, cur);
if (down_b) if (down_b) {
{
cb(cur, "ffn_down", il); cb(cur, "ffn_down", il);
} }
if (down_b) if (down_b) {
{
cur = ggml_add(ctx, cur, down_b); cur = ggml_add(ctx, cur, down_b);
} }
@ -5133,21 +5101,17 @@ struct llm_build_context {
LLM_NORM, cb, il); LLM_NORM, cb, il);
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
if (hparams.use_awq) { if (hparams.use_awq) {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn_mpt_awq(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL,
NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL,
model.layers[il].ffn_act, model.layers[il].ffn_act,
LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
} else {
}
else {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL,
NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} }
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} }
@ -5558,7 +5522,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "ffn_gate", OFFLOAD_FUNC }, { "ffn_gate", OFFLOAD_FUNC },
{ "ffn_gate_b", OFFLOAD_FUNC }, { "ffn_gate_b", OFFLOAD_FUNC },
{ "ffn_gate_par", OFFLOAD_FUNC }, { "ffn_gate_par", OFFLOAD_FUNC },
{"ffn_act", OFFLOAD_FUNC }, { "ffn_act", OFFLOAD_FUNC },
{ "ffn_down", OFFLOAD_FUNC }, { "ffn_down", OFFLOAD_FUNC },
{ "ffn_down_b", OFFLOAD_FUNC }, { "ffn_down_b", OFFLOAD_FUNC },
{ "ffn_out", OFFLOAD_FUNC }, { "ffn_out", OFFLOAD_FUNC },
@ -8864,9 +8828,9 @@ struct llama_model_params llama_model_default_params() {
/*.progress_callback_user_data =*/ nullptr, /*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides =*/ nullptr, /*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false, /*.vocab_only =*/ false,
/*.use_awq =*/ false,
/*.use_mmap =*/ true, /*.use_mmap =*/ true,
/*.use_mlock =*/ false, /*.use_mlock =*/ false,
/*.use_awq =*/ false,
}; };
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
@ -8960,7 +8924,9 @@ struct llama_model * llama_load_model_from_file(
const char * path_model, const char * path_model,
struct llama_model_params params) { struct llama_model_params params) {
ggml_time_init(); ggml_time_init();
llama_model * model = new llama_model; llama_model * model = new llama_model;
unsigned cur_percentage = 0; unsigned cur_percentage = 0;
if (params.progress_callback == NULL) { if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage; params.progress_callback_user_data = &cur_percentage;
@ -9087,7 +9053,7 @@ struct llama_context * llama_new_context_with_model(
if (params.embedding){ if (params.embedding){
ctx->embedding.resize(hparams.n_embd); ctx->embedding.resize(hparams.n_embd);
} }
{ {
static const size_t tensor_alignment = 32; static const size_t tensor_alignment = 32;
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data

View file

@ -192,7 +192,7 @@ extern "C" {
bool vocab_only; // only load the vocabulary, no weights bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM bool use_mlock; // force system to keep model in RAM
bool use_awq; // whether to use awq quantization bool use_awq; // whether to use awq quantization
}; };
struct llama_context_params { struct llama_context_params {