update: remove use_awq arg
This commit is contained in:
parent
2187a8debe
commit
13f60c417d
4 changed files with 24 additions and 75 deletions
|
@ -149,8 +149,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.seed = std::stoul(argv[i]);
|
params.seed = std::stoul(argv[i]);
|
||||||
} else if (arg == "--use-awq") {
|
|
||||||
params.use_awq = true;
|
|
||||||
} else if (arg == "-t" || arg == "--threads") {
|
} else if (arg == "-t" || arg == "--threads") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -811,7 +809,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" (can be specified more than once for multiple prompts).\n");
|
printf(" (can be specified more than once for multiple prompts).\n");
|
||||||
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
||||||
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||||
printf(" --use-awq Using AWQ quantization model in inferences\n");
|
|
||||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||||
printf(" -tb N, --threads-batch N\n");
|
printf(" -tb N, --threads-batch N\n");
|
||||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
|
@ -1021,7 +1018,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||||
mparams.tensor_split = params.tensor_split;
|
mparams.tensor_split = params.tensor_split;
|
||||||
mparams.use_mmap = params.use_mmap;
|
mparams.use_mmap = params.use_mmap;
|
||||||
mparams.use_mlock = params.use_mlock;
|
mparams.use_mlock = params.use_mlock;
|
||||||
mparams.use_awq = params.use_awq;
|
|
||||||
if (params.kv_overrides.empty()) {
|
if (params.kv_overrides.empty()) {
|
||||||
mparams.kv_overrides = NULL;
|
mparams.kv_overrides = NULL;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -125,7 +125,6 @@ struct gpt_params {
|
||||||
bool infill = false; // use infill mode
|
bool infill = false; // use infill mode
|
||||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
bool no_kv_offload = false; // disable KV offloading
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
bool use_awq = false; // use AWQ quantization infer
|
|
||||||
|
|
||||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
|
93
llama.cpp
93
llama.cpp
|
@ -1261,7 +1261,6 @@ struct llama_hparams {
|
||||||
float f_clamp_kqv;
|
float f_clamp_kqv;
|
||||||
float f_max_alibi_bias;
|
float f_max_alibi_bias;
|
||||||
|
|
||||||
bool use_awq;
|
|
||||||
|
|
||||||
bool operator!=(const llama_hparams & other) const {
|
bool operator!=(const llama_hparams & other) const {
|
||||||
if (this->vocab_only != other.vocab_only) return true;
|
if (this->vocab_only != other.vocab_only) return true;
|
||||||
|
@ -3478,9 +3477,9 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
if (model.hparams.use_awq) {
|
|
||||||
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
|
// AWQ ScaleActivation layer
|
||||||
}
|
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_STABLELM:
|
case LLM_ARCH_STABLELM:
|
||||||
|
@ -3754,7 +3753,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
|
||||||
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
||||||
|
|
||||||
model.hparams.vocab_only = params.vocab_only;
|
model.hparams.vocab_only = params.vocab_only;
|
||||||
model.hparams.use_awq = params.use_awq;
|
|
||||||
|
|
||||||
llm_load_arch (ml, model);
|
llm_load_arch (ml, model);
|
||||||
llm_load_hparams(ml, model);
|
llm_load_hparams(ml, model);
|
||||||
|
@ -3800,7 +3798,6 @@ enum llm_rope_type {
|
||||||
enum llm_ffn_op_type {
|
enum llm_ffn_op_type {
|
||||||
LLM_FFN_SILU,
|
LLM_FFN_SILU,
|
||||||
LLM_FFN_GELU,
|
LLM_FFN_GELU,
|
||||||
LLM_FFN_GELU_ACT,
|
|
||||||
LLM_FFN_RELU,
|
LLM_FFN_RELU,
|
||||||
LLM_FFN_RELU_SQR,
|
LLM_FFN_RELU_SQR,
|
||||||
};
|
};
|
||||||
|
@ -3968,6 +3965,7 @@ static struct ggml_tensor * llm_build_ffn(
|
||||||
struct ggml_tensor * gate_b,
|
struct ggml_tensor * gate_b,
|
||||||
struct ggml_tensor * down,
|
struct ggml_tensor * down,
|
||||||
struct ggml_tensor * down_b,
|
struct ggml_tensor * down_b,
|
||||||
|
struct ggml_tensor * act_scales,
|
||||||
llm_ffn_op_type type_op,
|
llm_ffn_op_type type_op,
|
||||||
llm_ffn_gate_type type_gate,
|
llm_ffn_gate_type type_gate,
|
||||||
const llm_build_cb & cb,
|
const llm_build_cb & cb,
|
||||||
|
@ -4012,6 +4010,10 @@ static struct ggml_tensor * llm_build_ffn(
|
||||||
{
|
{
|
||||||
cur = ggml_gelu(ctx, cur);
|
cur = ggml_gelu(ctx, cur);
|
||||||
cb(cur, "ffn_gelu", il);
|
cb(cur, "ffn_gelu", il);
|
||||||
|
if (act_scales != NULL) {
|
||||||
|
cur = ggml_div(ctx, cur, act_scales);
|
||||||
|
cb(cur, "ffn_act", il);
|
||||||
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_FFN_RELU:
|
case LLM_FFN_RELU:
|
||||||
{
|
{
|
||||||
|
@ -4045,55 +4047,6 @@ static struct ggml_tensor * llm_build_ffn(
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_tensor * llm_build_ffn_mpt_awq(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * cur,
|
|
||||||
struct ggml_tensor * up,
|
|
||||||
struct ggml_tensor * up_b,
|
|
||||||
struct ggml_tensor * down,
|
|
||||||
struct ggml_tensor * down_b,
|
|
||||||
struct ggml_tensor * act_scales,
|
|
||||||
llm_ffn_op_type type_op,
|
|
||||||
llm_ffn_gate_type type_gate,
|
|
||||||
const llm_build_cb & cb,
|
|
||||||
int il) {
|
|
||||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
|
|
||||||
cb(tmp, "ffn_up", il);
|
|
||||||
|
|
||||||
if (up_b) {
|
|
||||||
tmp = ggml_add(ctx, tmp, up_b);
|
|
||||||
cb(tmp, "ffn_up_b", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = tmp;
|
|
||||||
|
|
||||||
switch (type_op) {
|
|
||||||
case LLM_FFN_GELU_ACT:
|
|
||||||
{
|
|
||||||
cur = ggml_gelu(ctx, cur);
|
|
||||||
cb(cur, "ffn_relu", il);
|
|
||||||
cur = ggml_div(ctx, cur, act_scales);
|
|
||||||
cb(cur, "ffn_div(gelu)", il);
|
|
||||||
} break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (type_gate == LLM_FFN_PAR) {
|
|
||||||
cur = ggml_mul(ctx, cur, tmp);
|
|
||||||
cb(cur, "ffn_gate_par", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx, down, cur);
|
|
||||||
if (down_b) {
|
|
||||||
cb(cur, "ffn_down", il);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (down_b) {
|
|
||||||
cur = ggml_add(ctx, cur, down_b);
|
|
||||||
}
|
|
||||||
|
|
||||||
return cur;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if max_alibi_bias > 0 then apply ALiBi
|
// if max_alibi_bias > 0 then apply ALiBi
|
||||||
static struct ggml_tensor * llm_build_kqv(
|
static struct ggml_tensor * llm_build_kqv(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
@ -4379,6 +4332,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
} else {
|
} else {
|
||||||
|
@ -4558,6 +4512,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -4672,6 +4627,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -4776,6 +4732,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
|
NULL,
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -4980,6 +4937,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
|
NULL,
|
||||||
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -5066,6 +5024,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -5161,6 +5120,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
|
NULL,
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -5246,19 +5206,12 @@ struct llm_build_context {
|
||||||
NULL,
|
NULL,
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
if (hparams.use_awq) {
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
cur = llm_build_ffn_mpt_awq(ctx0, cur,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_up, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
model.layers[il].ffn_act,
|
model.layers[il].ffn_act,
|
||||||
LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
} else {
|
|
||||||
cur = llm_build_ffn(ctx0, cur,
|
|
||||||
model.layers[il].ffn_up, NULL,
|
|
||||||
NULL, NULL,
|
|
||||||
model.layers[il].ffn_down, NULL,
|
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
||||||
}
|
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5366,6 +5319,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -5478,6 +5432,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -5585,6 +5540,7 @@ struct llm_build_context {
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
NULL, NULL,
|
NULL, NULL,
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
|
NULL,
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
cb(ffn_output, "ffn_out", il);
|
cb(ffn_output, "ffn_out", il);
|
||||||
}
|
}
|
||||||
|
@ -9111,7 +9067,6 @@ struct llama_model_params llama_model_default_params() {
|
||||||
/*.vocab_only =*/ false,
|
/*.vocab_only =*/ false,
|
||||||
/*.use_mmap =*/ true,
|
/*.use_mmap =*/ true,
|
||||||
/*.use_mlock =*/ false,
|
/*.use_mlock =*/ false,
|
||||||
/*.use_awq =*/ false,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
|
|
1
llama.h
1
llama.h
|
@ -195,7 +195,6 @@ extern "C" {
|
||||||
bool vocab_only; // only load the vocabulary, no weights
|
bool vocab_only; // only load the vocabulary, no weights
|
||||||
bool use_mmap; // use mmap if possible
|
bool use_mmap; // use mmap if possible
|
||||||
bool use_mlock; // force system to keep model in RAM
|
bool use_mlock; // force system to keep model in RAM
|
||||||
bool use_awq; // whether to use awq quantization
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_context_params {
|
struct llama_context_params {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue