diff --git a/awqpy/README.md b/awqpy/README.md index 8f70f3d3b..11d25d948 100644 --- a/awqpy/README.md +++ b/awqpy/README.md @@ -56,7 +56,7 @@ The perplexity measurements in table above are done against the `wikitext2` test ## Results Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison -We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k +We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k ### Llama 7B (Build with OpenBLAS) diff --git a/common/common.cpp b/common/common.cpp index dadf95f5c..def94b4e1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -150,6 +150,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.seed = std::stoul(argv[i]); } else if (arg == "-awq" || arg == "--use-awq") { + if (++i >= argc) { + invalid_param = true; + break; + } params.use_awq = true; } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { @@ -806,7 +810,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n"); + printf(" -awq, --use-awq Using AWQ quantization model in inferences\n"); printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3bf8a9f13..c8d24b844 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1029,7 +1029,6 @@ print(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) - with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) diff --git a/convert.py b/convert.py index f1cdf762a..ea1392338 100755 --- a/convert.py +++ b/convert.py @@ -1163,7 +1163,7 @@ def main(args_in: list[str] | None = None) -> None: print("Saving new weighted model ...") add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) print(f"Saved weighted model at {tmp_model_path}.") - args.model = tmp_model_path + args.model = tmp_model_path if args.dump_single: model_plus = lazy_load_file(args.model) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 651ff0031..26b74dd06 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -164,7 +164,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.w1", # qwen ), - # Awq-activation gate + # AWQ-activation gate MODEL_TENSOR.FFN_ACT: ( "transformer.blocks.{bid}.ffn.act", # mpt ), diff --git a/llama.cpp b/llama.cpp index 40de8ae77..9ef48bfa7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3937,20 +3937,17 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_ffn_mpt_awq( - struct ggml_context *ctx, - struct ggml_tensor *cur, - struct ggml_tensor *up, - struct ggml_tensor *up_b, - struct ggml_tensor *gate, - struct ggml_tensor *gate_b, - struct ggml_tensor *down, - struct ggml_tensor *down_b, - struct ggml_tensor *act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb &cb, - int il) -{ + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); cb(tmp, "ffn_up", il);