From 8177ad4e374dc5605c410b48aace1d6f1f52ea90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tr=E1=BA=A7n=20=C4=90=E1=BB=A9c=20Nam?= <v.namtd12@vinai.io>
Date: Tue, 19 Dec 2023 23:25:00 +0700
Subject: [PATCH] update: work for both mpt and awqmpt

---
 common/common.cpp     |  6 +++--
 common/common.h       |  1 +
 convert-hf-to-gguf.py |  9 ++++----
 llama.cpp             | 52 ++++++++++++++++++++++++++++++-------------
 llama.h               |  1 +
 5 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 4a61ae593..97002329a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -149,6 +149,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.seed = std::stoul(argv[i]);
+        } else if (arg == "-awq" || arg == "--use-awq") {
+            params.use_awq = true;
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -804,6 +806,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        (can be specified more than once for multiple prompts).\n");
     printf("  --color               colorise output to distinguish prompt and user input from generations\n");
     printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    printf("  -awq, --use-awq       use AWQ quantization model in inference (default: disabled)\n");
     printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
@@ -1013,6 +1016,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
+    mparams.use_awq         = params.use_awq;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -1096,13 +1100,11 @@ void llama_batch_add(
 
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     auto mparams = llama_model_params_from_gpt_params(params);
-
     llama_model * model  = llama_load_model_from_file(params.model.c_str(), mparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return std::make_tuple(nullptr, nullptr);
     }
-
     auto cparams = llama_context_params_from_gpt_params(params);
 
     llama_context * lctx = llama_new_context_with_model(model, cparams);
diff --git a/common/common.h b/common/common.h
index e87ce1133..a2046d70f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -125,6 +125,7 @@ struct gpt_params {
     bool infill            = false; // use infill mode
     bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
+    bool use_awq           = false; // use AWQ quantization for inference
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index a180c73bd..3bf8a9f13 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -46,7 +46,7 @@ class Model:
         self.part_names = self._get_part_names()
         self.hparams = Model.load_hparams(self.dir_model)
         self.model_arch = self._get_model_architecture()
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
 
     def set_vocab(self):
         self._set_vocab_gpt2()
@@ -59,7 +59,7 @@ class Model:
                 from safetensors import safe_open
                 ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
             else:
-                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
+                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
 
             with ctx as model_part:
                 for name in model_part.keys():
@@ -444,7 +444,7 @@ class MPTModel(Model):
             # map tensor names
             if "scales" in name:
                 new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name + ".scales"
+                new_name = new_name.replace("scales", "act.scales") if new_name is not None else None  # keep None for the check below
             else:
                 new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
             if new_name is None:
@@ -1001,6 +1001,7 @@ dir_model = args.model
 if args.awq_path:
     from awqpy.apply_awq import add_scale_weights
     tmp_model_path = args.model / "weighted_model"
+    dir_model = tmp_model_path
     if tmp_model_path.is_dir():
         print(f"{tmp_model_path} exists as a weighted model.")
     else:
@@ -1008,7 +1009,6 @@ if args.awq_path:
         print("Saving new weighted model ...")
         add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
         print(f"Saved weighted model at {tmp_model_path}.") 
-        dir_model = tmp_model_path
 
 if not dir_model.is_dir():
     print(f'Error: {args.model} is not a directory', file=sys.stderr)
@@ -1029,6 +1029,7 @@ print(f"Loading model: {dir_model.name}")
 
 hparams = Model.load_hparams(dir_model)
 
+
 with torch.inference_mode():
     model_class = Model.from_model_architecture(hparams["architectures"][0])
     model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
diff --git a/llama.cpp b/llama.cpp
index 81c99fc3b..a52a4e2d3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1178,6 +1178,7 @@ struct llama_hparams {
 
     float f_clamp_kqv;
     float f_max_alibi_bias;
+    bool use_awq;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only  != other.vocab_only)  return true;
@@ -3379,7 +3380,6 @@ static void llm_load_tensors(
             case LLM_ARCH_MPT:
                 {
                     model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-
                     // output
                     {
                         ggml_backend_type backend_norm;
@@ -3423,18 +3423,31 @@ static void llm_load_tensors(
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
                         layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
-                        layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
+                        if (model.hparams.use_awq) {
+                            layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
+                        }
                         layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
 
                         if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
+                            if (model.hparams.use_awq) {
+                                vram_weights +=
                                 ggml_nbytes(layer.attn_norm) +
                                 ggml_nbytes(layer.wqkv)      +
                                 ggml_nbytes(layer.wo)        +
                                 ggml_nbytes(layer.ffn_norm)  +
                                 ggml_nbytes(layer.ffn_down)  +
-                                ggml_nbytes(layer.ffn_act) +
+                                ggml_nbytes(layer.ffn_act)   +
                                 ggml_nbytes(layer.ffn_up);
+                            }
+                            else {
+                                vram_weights +=
+                                ggml_nbytes(layer.attn_norm) +
+                                ggml_nbytes(layer.wqkv)      +
+                                ggml_nbytes(layer.wo)        +
+                                ggml_nbytes(layer.ffn_norm)  +
+                                ggml_nbytes(layer.ffn_down)  +
+                                ggml_nbytes(layer.ffn_up);
+                            }
                         }
                     }
                 } break;
@@ -3634,7 +3647,7 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
-
+        model.hparams.use_awq = params.use_awq;
         llm_load_arch   (ml, model);
         llm_load_hparams(ml, model);
         llm_load_vocab  (ml, model);
@@ -5119,13 +5132,23 @@ struct llm_build_context {
                         NULL,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
-                        model.layers[il].ffn_act,
-                        LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
+                if (hparams.use_awq) {
+                    cur = llm_build_ffn(ctx0, cur,
+                            model.layers[il].ffn_up,   NULL,
+                            NULL,                      NULL,
+                            model.layers[il].ffn_down, NULL,
+                            model.layers[il].ffn_act,
+                            LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
+                    
+                }
+                else {
+                    cur = llm_build_ffn(ctx0, cur,
+                            model.layers[il].ffn_up,   NULL,
+                            NULL,                      NULL,
+                            model.layers[il].ffn_down, NULL,
+                            LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                    
+                }
                 cb(cur, "ffn_out", il);
             }
 
@@ -8841,6 +8864,7 @@ struct llama_model_params llama_model_default_params() {
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
+        /*.use_awq                     =*/ false,
     };
@@ -8936,9 +8960,7 @@ struct llama_model * llama_load_model_from_file(
                              const char * path_model,
               struct llama_model_params   params) {
     ggml_time_init();
-
     llama_model * model = new llama_model;
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -9065,7 +9087,7 @@ struct llama_context * llama_new_context_with_model(
         if (params.embedding){
             ctx->embedding.resize(hparams.n_embd);
         }
-
+        
         {
             static const size_t tensor_alignment = 32;
             // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
diff --git a/llama.h b/llama.h
index b1f5fca62..fd2d3920d 100644
--- a/llama.h
+++ b/llama.h
@@ -192,6 +192,7 @@ extern "C" {
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
+        bool use_awq;  // whether to use awq quantization
     };
 
     struct llama_context_params {