From 8177ad4e374dc5605c410b48aace1d6f1f52ea90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20=C4=90=E1=BB=A9c=20Nam?= Date: Tue, 19 Dec 2023 23:25:00 +0700 Subject: [PATCH] update: work for both mpt and awqmpt --- common/common.cpp | 6 +++-- common/common.h | 1 + convert-hf-to-gguf.py | 9 ++++---- llama.cpp | 52 ++++++++++++++++++++++++++++++------------- llama.h | 1 + 5 files changed, 48 insertions(+), 21 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4a61ae593..97002329a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -149,6 +149,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.seed = std::stoul(argv[i]); + } else if (arg == "-awq" || arg == "--use-awq") { + params.use_awq = true; } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -804,6 +806,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" (can be specified more than once for multiple prompts).\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n"); printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); @@ -1013,6 +1016,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; + mparams.use_awq = params.use_awq; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1096,13 +1100,11 @@ void llama_batch_add( std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = 
llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); } - auto cparams = llama_context_params_from_gpt_params(params); llama_context * lctx = llama_new_context_with_model(model, cparams); diff --git a/common/common.h b/common/common.h index e87ce1133..a2046d70f 100644 --- a/common/common.h +++ b/common/common.h @@ -125,6 +125,7 @@ struct gpt_params { bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading + bool use_awq = false; // use AWQ quantization infer std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_v = "f16"; // KV cache data type for the V diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a180c73bd..3bf8a9f13 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -46,7 +46,7 @@ class Model: self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) def set_vocab(self): self._set_vocab_gpt2() @@ -59,7 +59,7 @@ class Model: from safetensors import safe_open ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) else: - ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) + ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True)) with ctx as model_part: for name 
in model_part.keys(): @@ -444,7 +444,7 @@ class MPTModel(Model): # map tensor names if "scales" in name: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name + ".scales" + new_name = new_name.replace("scales", "act.scales") else: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: @@ -1001,6 +1001,7 @@ dir_model = args.model if args.awq_path: from awqpy.apply_awq import add_scale_weights tmp_model_path = args.model / "weighted_model" + dir_model = tmp_model_path if tmp_model_path.is_dir(): print(f"{tmp_model_path} exists as a weighted model.") else: @@ -1008,7 +1009,6 @@ if args.awq_path: print("Saving new weighted model ...") add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) print(f"Saved weighted model at {tmp_model_path}.") - dir_model = tmp_model_path if not dir_model.is_dir(): print(f'Error: {args.model} is not a directory', file=sys.stderr) @@ -1029,6 +1029,7 @@ print(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) + with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) diff --git a/llama.cpp b/llama.cpp index 81c99fc3b..a52a4e2d3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1178,6 +1178,7 @@ struct llama_hparams { float f_clamp_kqv; float f_max_alibi_bias; + bool use_awq; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -3379,7 +3380,6 @@ static void llm_load_tensors( case LLM_ARCH_MPT: { model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - // output { ggml_backend_type backend_norm; @@ -3423,18 +3423,31 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_down = 
ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend); + if (model.hparams.use_awq) { + layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend); + } layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { - vram_weights += + if (model.hparams.use_awq) { + vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_down) + - ggml_nbytes(layer.ffn_act) + + ggml_nbytes(layer.ffn_act) + ggml_nbytes(layer.ffn_up); + } + else { + vram_weights += + ggml_nbytes(layer.attn_norm) + + ggml_nbytes(layer.wqkv) + + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); + } } } } break; @@ -3634,7 +3647,7 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; - + model.hparams.use_awq = params.use_awq; llm_load_arch (ml, model); llm_load_hparams(ml, model); llm_load_vocab (ml, model); @@ -5119,13 +5132,23 @@ struct llm_build_context { NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, cur, - model.layers[il].ffn_up, NULL, - NULL, NULL, - model.layers[il].ffn_down, NULL, - model.layers[il].ffn_act, - LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il); + if (hparams.use_awq) { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + model.layers[il].ffn_act, + LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il); + + } + else { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + + } 
cb(cur, "ffn_out", il); } @@ -8841,6 +8864,7 @@ struct llama_model_params llama_model_default_params() { /*.progress_callback_user_data =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, + /*.use_awq =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, }; @@ -8936,9 +8960,7 @@ struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_model_params params) { ggml_time_init(); - llama_model * model = new llama_model; - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -9065,7 +9087,7 @@ struct llama_context * llama_new_context_with_model( if (params.embedding){ ctx->embedding.resize(hparams.n_embd); } - + { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data diff --git a/llama.h b/llama.h index b1f5fca62..fd2d3920d 100644 --- a/llama.h +++ b/llama.h @@ -192,6 +192,7 @@ extern "C" { bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM + bool use_awq; // whether to use awq quantization }; struct llama_context_params {