update: work for both mpt and awqmpt

2023-12-19 23:25:00 +07:00 · 2023-12-19 23:25:00 +07:00 · 8177ad4e37
commit 8177ad4e37
parent 8fece75e35
5 changed files with 48 additions and 21 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -149,6 +149,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.seed = std::stoul(argv[i]);
+        } else if (arg == "-awq" || arg == "--use-awq") {
+            params.use_awq = true;
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@ -804,6 +806,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        (can be specified more than once for multiple prompts).\n");
    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    printf("  -awq, --use-awq       use AWQ quantization model in inference\n"); // boolean switch: the parser in gpt_params_parse_ex consumes no value for it
    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N\n");
    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
@ -1013,6 +1016,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
+    mparams.use_awq         = params.use_awq;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@ -1096,13 +1100,11 @@ void llama_batch_add(

 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto mparams = llama_model_params_from_gpt_params(params);
-
    llama_model * model  = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return std::make_tuple(nullptr, nullptr);
    }
-
    auto cparams = llama_context_params_from_gpt_params(params);

    llama_context * lctx = llama_new_context_with_model(model, cparams);
--- a/common/common.h
+++ b/common/common.h
@ -125,6 +125,7 @@ struct gpt_params {
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
+    bool use_awq           = false; // use AWQ quantization infer

    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -46,7 +46,7 @@ class Model:
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)

    def set_vocab(self):
        self._set_vocab_gpt2()
@ -59,7 +59,7 @@ class Model:
                from safetensors import safe_open
                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
            else:
-                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
+                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))

            with ctx as model_part:
                for name in model_part.keys():
@ -444,7 +444,7 @@ class MPTModel(Model):
            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name + ".scales"
+                new_name = new_name.replace("scales", "act.scales") if new_name is not None else None  # keep None so the `is None` check below handles an unmapped tensor
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
@ -1001,6 +1001,7 @@ dir_model = args.model
 if args.awq_path:
    from awqpy.apply_awq import add_scale_weights
    tmp_model_path = args.model / "weighted_model"
+    dir_model = tmp_model_path
    if tmp_model_path.is_dir():
        print(f"{tmp_model_path} exists as a weighted model.")
    else:
@ -1008,7 +1009,6 @@ if args.awq_path:
        print("Saving new weighted model ...")
        add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
        print(f"Saved weighted model at {tmp_model_path}.") 
-        dir_model = tmp_model_path

 if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file=sys.stderr)
@ -1029,6 +1029,7 @@ print(f"Loading model: {dir_model.name}")

 hparams = Model.load_hparams(dir_model)

+
 with torch.inference_mode():
    model_class = Model.from_model_architecture(hparams["architectures"][0])
    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
--- a/llama.cpp
+++ b/llama.cpp
@ -1178,6 +1178,7 @@ struct llama_hparams {

    float f_clamp_kqv;
    float f_max_alibi_bias;
+    bool use_awq;

    bool operator!=(const llama_hparams & other) const {
        if (this->vocab_only  != other.vocab_only)  return true;
@ -3379,7 +3380,6 @@ static void llm_load_tensors(
            case LLM_ARCH_MPT:
                {
                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-
                    // output
                    {
                        ggml_backend_type backend_norm;
@ -3423,18 +3423,31 @@ static void llm_load_tensors(
                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
-                        layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
+                        if (model.hparams.use_awq) {
+                            layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
+                        }
                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);

                        if (backend == GGML_BACKEND_GPU) {
-                            vram_weights +=
+                            if (model.hparams.use_awq) {
+                                vram_weights +=
                                ggml_nbytes(layer.attn_norm) +
                                ggml_nbytes(layer.wqkv)      +
                                ggml_nbytes(layer.wo)        +
                                ggml_nbytes(layer.ffn_norm)  +
                                ggml_nbytes(layer.ffn_down)  +
-                                ggml_nbytes(layer.ffn_act) +
+                                ggml_nbytes(layer.ffn_act)   +
                                ggml_nbytes(layer.ffn_up);
+                            }
+                            else {
+                                vram_weights +=
+                                ggml_nbytes(layer.attn_norm) +
+                                ggml_nbytes(layer.wqkv)      +
+                                ggml_nbytes(layer.wo)        +
+                                ggml_nbytes(layer.ffn_norm)  +
+                                ggml_nbytes(layer.ffn_down)  +
+                                ggml_nbytes(layer.ffn_up);
+                            }
                        }
                    }
                } break;
@ -3634,7 +3647,7 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

        model.hparams.vocab_only = params.vocab_only;
-
+        model.hparams.use_awq = params.use_awq;
        llm_load_arch   (ml, model);
        llm_load_hparams(ml, model);
        llm_load_vocab  (ml, model);
@ -5119,13 +5132,23 @@ struct llm_build_context {
                        NULL,
                        LLM_NORM, cb, il);
                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
-                        model.layers[il].ffn_act,
-                        LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
+                if (hparams.use_awq) {
+                    cur = llm_build_ffn(ctx0, cur,
+                            model.layers[il].ffn_up,   NULL,
+                            NULL,                      NULL,
+                            model.layers[il].ffn_down, NULL,
+                            model.layers[il].ffn_act,
+                            LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
+                    
+                }
+                else {
+                    cur = llm_build_ffn(ctx0, cur,
+                            model.layers[il].ffn_up,   NULL,
+                            NULL,                      NULL,
+                            model.layers[il].ffn_down, NULL,
+                            LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                    
+                }
                cb(cur, "ffn_out", il);
            }

@ -8841,6 +8864,7 @@ struct llama_model_params llama_model_default_params() {
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
+        /*.use_awq                     =*/ false, // positional init: order must match the llama_model_params declaration, where use_awq comes last
    };
@ -8936,9 +8960,7 @@ struct llama_model * llama_load_model_from_file(
                             const char * path_model,
              struct llama_model_params   params) {
    ggml_time_init();
-
    llama_model * model = new llama_model;
-
    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
@ -9065,7 +9087,7 @@ struct llama_context * llama_new_context_with_model(
        if (params.embedding){
            ctx->embedding.resize(hparams.n_embd);
        }
-
+        
        {
            static const size_t tensor_alignment = 32;
            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
--- a/llama.h
+++ b/llama.h
@ -192,6 +192,7 @@ extern "C" {
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
+        bool use_awq;  // whether to use awq quantization
    };

    struct llama_context_params {