From d94eaa69d1a352b65d499909bb776981ff98cfdb Mon Sep 17 00:00:00 2001 From: zzx Date: Mon, 24 Jun 2024 10:39:22 +0800 Subject: [PATCH 1/2] By changing priority between --token_embedding_type, --output_tensor_type and --pure, it is more friendly for users to define their own quantization strategy --- examples/quantize/quantize.cpp | 4 ++-- llama.cpp | 6 +++--- "\177\177" | 28 ++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 "\177\177" diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 28584e14b..f90f1c145 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -101,9 +101,9 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --keep-split: will generate quatized model in the same shards as input"); + printf(" --keep-split: will generate quatized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); + printf(" : Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { diff --git a/llama.cpp b/llama.cpp index 8818c6928..51f1444ac 100644 --- a/llama.cpp +++ b/llama.cpp @@ -15691,15 +15691,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = params->token_embedding_type; } if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { new_type = params->output_tensor_type; } + if (!params->pure && ggml_is_quantized(default_type)) { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. diff --git "a/\177\177" "b/\177\177" new file mode 100644 index 000000000..f8c08c23b --- /dev/null +++ "b/\177\177" @@ -0,0 +1,28 @@ +export PATH=/usr/local/cuda-12.2/bin${PATH:+:${PATH}} +export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +export PATH=/local/mnt/workspace/miniconda3/bin:$PATH +export CUDA_VISIBLE_DEVICES=0,1,2,3ii: +: +: +# >>> conda initialize >>> +# !! Contents within this block are managed by 'conda init' !! +__conda_setup="$('/local/mnt/workspace/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__conda_setup" +else + if [ -f "/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" ]; then + . 
"/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" + else + export PATH="/local/mnt/workspace/miniconda3/bin:$PATH" + fi +fi +unset __conda_setup +# <<< conda initialize <<< + +# proxy setting +export http_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ +export https_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ +export ftp_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ + +# HF cache +export HF_HOME=/local/mnt/zzx/.cache/huggingface From 7af279105eab10c0625a7377416a7d5dd5b7ed82 Mon Sep 17 00:00:00 2001 From: zzx Date: Wed, 26 Jun 2024 14:57:33 +0800 Subject: [PATCH 2/2] change some details of command help info --- examples/quantize/quantize.cpp | 6 +++--- "\177\177" | 28 ---------------------------- 2 files changed, 3 insertions(+), 31 deletions(-) delete mode 100644 "\177\177" diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 99057e932..539606274 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -46,9 +46,9 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, - { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "15.02G, +0.0000 ppl @ Llama-3-8B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "15.02G, -0.0025 ppl @ Llama-3-8B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "30.06G @ Llama-3-8B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. 
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; diff --git "a/\177\177" "b/\177\177" deleted file mode 100644 index f8c08c23b..000000000 --- "a/\177\177" +++ /dev/null @@ -1,28 +0,0 @@ -export PATH=/usr/local/cuda-12.2/bin${PATH:+:${PATH}} -export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} -export PATH=/local/mnt/workspace/miniconda3/bin:$PATH -export CUDA_VISIBLE_DEVICES=0,1,2,3ii: -: -: -# >>> conda initialize >>> -# !! Contents within this block are managed by 'conda init' !! -__conda_setup="$('/local/mnt/workspace/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" -if [ $? -eq 0 ]; then - eval "$__conda_setup" -else - if [ -f "/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" ]; then - . "/local/mnt/workspace/miniconda3/etc/profile.d/conda.sh" - else - export PATH="/local/mnt/workspace/miniconda3/bin:$PATH" - fi -fi -unset __conda_setup -# <<< conda initialize <<< - -# proxy setting -export http_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ -export https_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ -export ftp_proxy=http://secure-proxy-aprdc2-1.qualcomm.com:9090/ - -# HF cache -export HF_HOME=/local/mnt/zzx/.cache/huggingface