Formatted other files

2023-12-20 11:04:03 +07:00 · 2023-12-20 11:04:03 +07:00 · c02f6df7c4
commit c02f6df7c4
parent 0610672b19
6 changed files with 19 additions and 19 deletions
--- a/awqpy/README.md
+++ b/awqpy/README.md
@@ -56,7 +56,7 @@ The perplexity measurements in table above are done against the `wikitext2` test

 ## Results
 Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
-We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k
+We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k

 ### Llama 7B (Build with OpenBLAS)

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -150,6 +150,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            }
            params.seed = std::stoul(argv[i]);
        } else if (arg == "-awq" || arg == "--use-awq") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
            params.use_awq = true;
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
@@ -806,7 +810,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        (can be specified more than once for multiple prompts).\n");
    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -awq SEED, -use-awq   Using AWQ quantization model in inferences\n");
+    printf("  -awq, --use-awq       Using AWQ quantization model in inferences\n");
    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N\n");
    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1029,7 +1029,6 @@ print(f"Loading model: {dir_model.name}")

 hparams = Model.load_hparams(dir_model)

-
 with torch.inference_mode():
    model_class = Model.from_model_architecture(hparams["architectures"][0])
    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -164,7 +164,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.w1",                             # qwen
        ),
        
-        # Awq-activation gate
+        # AWQ-activation gate
        MODEL_TENSOR.FFN_ACT: (
            "transformer.blocks.{bid}.ffn.act",  # mpt
        ),
--- a/llama.cpp
+++ b/llama.cpp
@@ -3941,16 +3941,13 @@ static struct ggml_tensor * llm_build_ffn_mpt_awq(
         struct ggml_tensor * cur,
         struct ggml_tensor * up,
         struct ggml_tensor * up_b,
-    struct ggml_tensor *gate,
-    struct ggml_tensor *gate_b,
         struct ggml_tensor * down,
         struct ggml_tensor * down_b,
         struct ggml_tensor * act_scales,
            llm_ffn_op_type   type_op,
          llm_ffn_gate_type   type_gate,
         const llm_build_cb & cb,
-    int il)
-{
+                        int   il) {
    struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
    cb(tmp, "ffn_up", il);