diff --git a/common/common.cpp b/common/common.cpp
index 3d4842a5e..b3425ab09 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -149,8 +149,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.seed = std::stoul(argv[i]);
-        } else if (arg == "--use-awq") {
-            params.use_awq = true;
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -811,7 +809,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        (can be specified more than once for multiple prompts).\n");
     printf("  --color               colorise output to distinguish prompt and user input from generations\n");
     printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  --use-awq             Using AWQ quantization model in inferences\n");
     printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
@@ -1021,7 +1018,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap     = params.use_mmap;
     mparams.use_mlock    = params.use_mlock;
-    mparams.use_awq      = params.use_awq;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
diff --git a/common/common.h b/common/common.h
index a2046d70f..e87ce1133 100644
--- a/common/common.h
+++ b/common/common.h
@@ -125,7 +125,6 @@ struct gpt_params {
     bool infill           = false; // use infill mode
     bool dump_kv_cache    = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload    = false; // disable KV offloading
-    bool use_awq          = false; // use AWQ quantization infer
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/llama.cpp b/llama.cpp
index 5c0d8b6c3..7a9d17070 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1261,7 +1261,6 @@ struct llama_hparams {
 
     float f_clamp_kqv;
     float f_max_alibi_bias;
-    bool  use_awq;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -3478,9 +3477,9 @@ static bool llm_load_tensors(
 
                     layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
                     layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
-                    if (model.hparams.use_awq) {
-                        layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
-                    }
+
+                    // AWQ ScaleActivation layer
+                    layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
                 }
             } break;
         case LLM_ARCH_STABLELM:
@@ -3754,7 +3753,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
-        model.hparams.use_awq    = params.use_awq;
 
         llm_load_arch   (ml, model);
         llm_load_hparams(ml, model);
@@ -3800,7 +3798,6 @@ enum llm_rope_type {
 enum llm_ffn_op_type {
     LLM_FFN_SILU,
     LLM_FFN_GELU,
-    LLM_FFN_GELU_ACT,
     LLM_FFN_RELU,
     LLM_FFN_RELU_SQR,
 };
@@ -3968,6 +3965,7 @@ static struct ggml_tensor * llm_build_ffn(
         struct ggml_tensor * gate_b,
         struct ggml_tensor * down,
         struct ggml_tensor * down_b,
+        struct ggml_tensor * act_scales,
         llm_ffn_op_type   type_op,
         llm_ffn_gate_type type_gate,
         const llm_build_cb & cb,
@@ -4012,6 +4010,10 @@ static struct ggml_tensor * llm_build_ffn(
             {
                 cur = ggml_gelu(ctx, cur);
                 cb(cur, "ffn_gelu", il);
+                if (act_scales != NULL) {
+                    cur = ggml_div(ctx, cur, act_scales);
+                    cb(cur, "ffn_act", il);
+                }
             } break;
         case LLM_FFN_RELU:
             {
@@ -4045,55 +4047,6 @@ static struct ggml_tensor * llm_build_ffn(
     return cur;
 }
 
-static struct ggml_tensor * llm_build_ffn_mpt_awq(
-        struct ggml_context * ctx,
-        struct ggml_tensor * cur,
-        struct ggml_tensor * up,
-        struct ggml_tensor * up_b,
-        struct ggml_tensor * down,
-        struct ggml_tensor * down_b,
-        struct ggml_tensor * act_scales,
-        llm_ffn_op_type type_op,
-        llm_ffn_gate_type type_gate,
-        const llm_build_cb & cb,
-        int il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
-    cb(tmp, "ffn_up", il);
-
-    if (up_b) {
-        tmp = ggml_add(ctx, tmp, up_b);
-        cb(tmp, "ffn_up_b", il);
-    }
-
-    cur = tmp;
-
-    switch (type_op) {
-        case LLM_FFN_GELU_ACT:
-            {
-                cur = ggml_gelu(ctx, cur);
-                cb(cur, "ffn_relu", il);
-                cur = ggml_div(ctx, cur, act_scales);
-                cb(cur, "ffn_div(gelu)", il);
-            } break;
-    }
-
-    if (type_gate == LLM_FFN_PAR) {
-        cur = ggml_mul(ctx, cur, tmp);
-        cb(cur, "ffn_gate_par", il);
-    }
-
-    cur = ggml_mul_mat(ctx, down, cur);
-    if (down_b) {
-        cb(cur, "ffn_down", il);
-    }
-
-    if (down_b) {
-        cur = ggml_add(ctx, cur, down_b);
-    }
-
-    return cur;
-}
-
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
@@ -4379,6 +4332,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             } else {
@@ -4558,6 +4512,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -4672,6 +4627,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         NULL, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -4776,6 +4732,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -4980,6 +4937,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5066,6 +5024,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5161,6 +5120,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5246,19 +5206,12 @@ struct llm_build_context {
                         NULL,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
-                if (hparams.use_awq) {
-                    cur = llm_build_ffn_mpt_awq(ctx0, cur,
-                            model.layers[il].ffn_up, NULL,
-                            model.layers[il].ffn_down, NULL,
-                            model.layers[il].ffn_act,
-                            LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
-                } else {
-                    cur = llm_build_ffn(ctx0, cur,
-                            model.layers[il].ffn_up, NULL,
-                            NULL, NULL,
-                            model.layers[il].ffn_down, NULL,
-                            LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                }
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_act,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
             }
 
@@ -5366,6 +5319,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5478,6 +5432,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
+                        NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }
@@ -5585,6 +5540,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(ffn_output, "ffn_out", il);
             }
@@ -9111,7 +9067,6 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only  =*/ false,
         /*.use_mmap    =*/ true,
         /*.use_mlock   =*/ false,
-        /*.use_awq     =*/ false,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/llama.h b/llama.h
index 78d735448..af76bae2d 100644
--- a/llama.h
+++ b/llama.h
@@ -195,7 +195,6 @@ extern "C" {
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
-        bool use_awq;    // whether to use awq quantization
     };
 
     struct llama_context_params {
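
Note on the change (not part of the patch): with `LLM_FFN_GELU_ACT` and `llm_build_ffn_mpt_awq` removed, the AWQ path is simply the regular `LLM_FFN_GELU` branch of `llm_build_ffn` followed by an element-wise division by the optional `ffn_act` scales; architectures without an AWQ ScaleActivation tensor pass `NULL` for the new `act_scales` argument, which is why every other call site gains an extra `NULL`. Below is a minimal standalone sketch of that math; the helper names and the tanh-based GELU approximation are illustrative assumptions, while the patch itself relies on `ggml_gelu` and `ggml_div`.

    // Illustrative sketch only: mirrors the act_scales branch added to llm_build_ffn.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // tanh-based GELU approximation (ggml_gelu uses a comparable approximation)
    static float gelu(float x) {
        return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
    }

    // GELU, then divide by per-channel AWQ activation scales when they are present
    static void apply_ffn_act(std::vector<float> & x, const std::vector<float> * act_scales) {
        for (size_t i = 0; i < x.size(); ++i) {
            x[i] = gelu(x[i]);
            if (act_scales != nullptr) {
                x[i] /= (*act_scales)[i]; // corresponds to ggml_div(ctx, cur, act_scales)
            }
        }
    }

    int main() {
        std::vector<float> x      = {-1.0f, 0.5f, 2.0f};
        std::vector<float> scales = { 1.1f, 0.9f, 1.3f}; // hypothetical ffn_act values
        apply_ffn_act(x, &scales);  // AWQ model: ffn_act scales were found in the model
        printf("%.4f %.4f %.4f\n", x[0], x[1], x[2]);
        return 0;
    }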