From d94eaa69d1a352b65d499909bb776981ff98cfdb Mon Sep 17 00:00:00 2001 From: zzx Date: Mon, 24 Jun 2024 10:39:22 +0800 Subject: [PATCH 1/2] By changing priority between --token_embedding_type, --output_tensor_type and --pure, it is more friendly for users to define their own quantization strategy --- examples/quantize/quantize.cpp | 4 ++-- llama.cpp | 6 +++--- "\177\177" | 28 ++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 "\177\177" diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 28584e14b..f90f1c145 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -101,9 +101,9 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --keep-split: will generate quatized model in the same shards as input"); + printf(" --keep-split: will generate quatized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); + printf(" : Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { diff --git a/llama.cpp b/llama.cpp index 8818c6928..51f1444ac 100644 --- a/llama.cpp +++ b/llama.cpp @@ -15691,15 +15691,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = params->token_embedding_type; } if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { new_type = params->output_tensor_type; } + if (!params->pure && ggml_is_quantized(default_type)) { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. diff --git "a/\177\177" "b/\177\177" new file mode 100644 index 000000000..f8c08c23b --- /dev/null +++ "b/\177\177" @@ -0,0 +1,28 @@ +export PATH=/usr/local/cuda-12.2/bin${PATH:+:${PATH}} +export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +export PATH=/local/mnt/workspace/miniconda3/bin:$PATH +export CUDA_VISIBLE_DEVICES=0,1,2,3ii: +: +: +# >>> conda initialize >>> +# !! Contents within this block are managed by 'conda init' !! +__conda_setup="$('/local/mnt/workspace/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__conda_setup" +else + if [ -f "/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" ]; then + . 
"/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" + else + export PATH="/local/mnt/workspace/miniconda3/bin:$PATH" + fi +fi +unset __conda_setup +# <<< conda initialize <<< + +# proxy setting +export http_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ +export https_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ +export ftp_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ + +# HF cache +export HF_HOME=/local/mnt/zzx/.cache/huggingface From 7af279105eab10c0625a7377416a7d5dd5b7ed82 Mon Sep 17 00:00:00 2001 From: zzx Date: Wed, 26 Jun 2024 14:57:33 +0800 Subject: [PATCH 2/2] change some details of command help info --- examples/quantize/quantize.cpp | 6 +++--- "\177\177" | 28 ---------------------------- 2 files changed, 3 insertions(+), 31 deletions(-) delete mode 100644 "\177\177" diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 99057e932..539606274 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -46,9 +46,9 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, - { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "15.02G, +0.0000 ppl @ Llama-3-8B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "15.02G, -0.0025 ppl @ Llama-3-8B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "30.06G @ Llama-3-8B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. 
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; diff --git "a/\177\177" "b/\177\177" deleted file mode 100644 index f8c08c23b..000000000 --- "a/\177\177" +++ /dev/null @@ -1,28 +0,0 @@ -export PATH=/usr/local/cuda-12.2/bin${PATH:+:${PATH}} -export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} -export PATH=/local/mnt/workspace/miniconda3/bin:$PATH -export CUDA_VISIBLE_DEVICES=0,1,2,3ii: -: -: -# >>> conda initialize >>> -# !! Contents within this block are managed by 'conda init' !! -__conda_setup="$('/local/mnt/workspace/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" -if [ $? -eq 0 ]; then - eval "$__conda_setup" -else - if [ -f "/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" ]; then - . "/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" - else - export PATH="/local/mnt/workspace/miniconda3/bin:$PATH" - fi -fi -unset __conda_setup -# <<< conda initialize <<< - -# proxy setting -export http_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ -export https_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ -export ftp_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ - -# HF cache -export HF_HOME=/local/mnt/zzx/.cache/huggingface