update: remove use_awq arg

2023-12-25 17:13:06 +07:00 · 2023-12-25 17:13:06 +07:00 · 13f60c417d
commit 13f60c417d
parent 2187a8debe
4 changed files with 24 additions and 75 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -149,8 +149,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.seed = std::stoul(argv[i]);
-        } else if (arg == "--use-awq") {
-            params.use_awq = true;
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@ -811,7 +809,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        (can be specified more than once for multiple prompts).\n");
    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  --use-awq             Using AWQ quantization model in inferences\n");
    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N\n");
    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
@ -1021,7 +1018,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
-    mparams.use_awq         = params.use_awq;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
--- a/common/common.h
+++ b/common/common.h
@ -125,7 +125,6 @@ struct gpt_params {
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
-    bool use_awq           = false; // use AWQ quantization infer

    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V
--- a/llama.cpp
+++ b/llama.cpp
@ -1261,7 +1261,6 @@ struct llama_hparams {
    float f_clamp_kqv;
    float f_max_alibi_bias;

-    bool use_awq;

    bool operator!=(const llama_hparams & other) const {
        if (this->vocab_only    != other.vocab_only)    return true;
@ -3478,9 +3477,9 @@ static bool llm_load_tensors(

                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
-                        if (model.hparams.use_awq) {
-                            layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
-                        }
+                        
+                        // AWQ ScaleActivation layer
+                        layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
                    }
                } break;
            case LLM_ARCH_STABLELM:
@ -3754,7 +3753,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

        model.hparams.vocab_only = params.vocab_only;
-        model.hparams.use_awq    = params.use_awq;

        llm_load_arch   (ml, model);
        llm_load_hparams(ml, model);
@ -3800,7 +3798,6 @@ enum llm_rope_type {
 enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
-    LLM_FFN_GELU_ACT,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
 };
@ -3968,6 +3965,7 @@ static struct ggml_tensor * llm_build_ffn(
         struct ggml_tensor * gate_b,
         struct ggml_tensor * down,
         struct ggml_tensor * down_b,
+         struct ggml_tensor * act_scales,
            llm_ffn_op_type   type_op,
          llm_ffn_gate_type   type_gate,
         const llm_build_cb & cb,
@ -4012,6 +4010,10 @@ static struct ggml_tensor * llm_build_ffn(
            {
                cur = ggml_gelu(ctx, cur);
                cb(cur, "ffn_gelu", il);
+                if (act_scales != NULL) {
+                    cur = ggml_div(ctx, cur, act_scales);
+                    cb(cur, "ffn_act", il);
+                }
            } break;
        case LLM_FFN_RELU:
            {
@ -4045,55 +4047,6 @@ static struct ggml_tensor * llm_build_ffn(
    return cur;
 }

-static struct ggml_tensor * llm_build_ffn_mpt_awq(
-        struct ggml_context * ctx,
-         struct ggml_tensor * cur,
-         struct ggml_tensor * up,
-         struct ggml_tensor * up_b,
-         struct ggml_tensor * down,
-         struct ggml_tensor * down_b,
-         struct ggml_tensor * act_scales,
-            llm_ffn_op_type   type_op,
-          llm_ffn_gate_type   type_gate,
-         const llm_build_cb & cb,
-                        int   il) {
-    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
-    cb(tmp, "ffn_up", il);
-
-    if (up_b) {
-        tmp = ggml_add(ctx, tmp, up_b);
-        cb(tmp, "ffn_up_b", il);
-    }
-
-    cur = tmp;
-
-    switch (type_op) {
-        case LLM_FFN_GELU_ACT:
-            {
-                cur = ggml_gelu(ctx, cur);
-                cb(cur, "ffn_relu", il);
-                cur = ggml_div(ctx, cur, act_scales);
-                cb(cur, "ffn_div(gelu)", il);
-            } break;
-    }
-
-    if (type_gate == LLM_FFN_PAR) {
-        cur = ggml_mul(ctx, cur, tmp);
-        cb(cur, "ffn_gate_par", il);
-    }
-
-    cur = ggml_mul_mat(ctx, down, cur);
-    if (down_b) {
-        cb(cur, "ffn_down", il);
-    }
-
-    if (down_b) {
-        cur = ggml_add(ctx, cur, down_b);
-    }
-
-    return cur;
-}
-
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
        struct ggml_context * ctx,
@ -4379,6 +4332,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   NULL,
                        model.layers[il].ffn_gate, NULL,
                        model.layers[il].ffn_down, NULL,
+                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            } else {
@ -4558,6 +4512,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   NULL,
                        model.layers[il].ffn_gate, NULL,
                        model.layers[il].ffn_down, NULL,
+                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -4672,6 +4627,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   NULL,
                        NULL,                      NULL,
                        model.layers[il].ffn_down, NULL,
+                        NULL,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -4776,6 +4732,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                        NULL,                      NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -4980,6 +4937,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                        NULL,                      NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                        LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -5066,6 +5024,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   NULL,
                        model.layers[il].ffn_gate, NULL,
                        model.layers[il].ffn_down, NULL,
+                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -5161,6 +5120,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                        NULL,                      NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -5246,19 +5206,12 @@ struct llm_build_context {
                        NULL,
                        LLM_NORM, cb, il);
                cb(cur, "ffn_norm", il);
-                if (hparams.use_awq) {
-                    cur = llm_build_ffn_mpt_awq(ctx0, cur,
-                            model.layers[il].ffn_up,   NULL,
-                            model.layers[il].ffn_down, NULL,
-                            model.layers[il].ffn_act,
-                            LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
-                } else {
-                    cur = llm_build_ffn(ctx0, cur,
-                            model.layers[il].ffn_up,   NULL,
-                            NULL,                      NULL,
-                            model.layers[il].ffn_down, NULL,
-                            LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-                }
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_act,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(cur, "ffn_out", il);
            }

@ -5366,6 +5319,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   NULL,
                        model.layers[il].ffn_gate, NULL,
                        model.layers[il].ffn_down, NULL,
+                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -5478,6 +5432,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   NULL,
                        model.layers[il].ffn_gate, NULL,
                        model.layers[il].ffn_down, NULL,
+                        NULL,
                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                cb(cur, "ffn_out", il);
            }
@ -5585,6 +5540,7 @@ struct llm_build_context {
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                        NULL,                      NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(ffn_output, "ffn_out", il);
            }
@ -9111,7 +9067,6 @@ struct llama_model_params llama_model_default_params() {
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
-        /*.use_awq                     =*/ false,
    };

 #ifdef GGML_USE_METAL
--- a/llama.h
+++ b/llama.h
@ -195,7 +195,6 @@ extern "C" {
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
-        bool use_awq;    // whether to use awq quantization
    };

    struct llama_context_params {