diff --git a/README.md b/README.md
index 95471fdbb..bf3eb0b76 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
+
+- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
+
 - Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
 
 - A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
diff --git a/ci/run.sh b/ci/run.sh
index e1486e7c1..942b2e00c 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
-    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks
2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log function check_ppl { qnt="$1" @@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # lora + function compare_ppl { + qnt="$1" + ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" + return 20 + fi + + printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" + return 0 + } + + path_lora="../models-mnt/open-llama/3B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" + + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + + python3 ../convert-lora-to-ggml.py ${path_lora} + + # f16 + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + f16 lora-base + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + set +e } @@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat 
$OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" + gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" + gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" + gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" + gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } # open_llama_7b_v2 @@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 { ./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + 
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # lora + function compare_ppl { + qnt="$1" + ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" + return 20 + fi + + printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" + return 0 + } + + path_lora="../models-mnt/open-llama/7B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" + + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt + + python3 ../convert-lora-to-ggml.py ${path_lora} + + # f16 + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # currently not supported by the CUDA backend + # q8_0 + #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat 
$OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + f16 lora-base + #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + set +e } @@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" + gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" + #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" + #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" + #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } ## main diff --git a/common/common.cpp b/common/common.cpp index ff19ec4e5..0d91a6a35 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,12 +733,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -746,3 +746,36 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok return std::string(result.data(), result.size()); } + +std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + +std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & tokens) { + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + 
result += piece; + } + + return result; +} diff --git a/common/common.h b/common/common.h index 17d271e67..97fda2be7 100644 --- a/common/common.h +++ b/common/common.h @@ -28,6 +28,7 @@ struct gpt_params { int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t n_beams = 0; // if non-zero then use beam search of given width. float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor @@ -115,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // Vocab utils // +// tokenizes a string into a vector of tokens +// should work similar to Python's `tokenizer.encode` std::vector llama_tokenize( struct llama_context * ctx, const std::string & text, bool add_bos); -std::string llama_token_to_str( +// tokenizes a token into a piece +// should work similar to Python's `tokenizer.id_to_piece` +std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); + +// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function +// that takes into account the tokenizer type and decides how to handle the leading space +// +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +// removes the leading space from the first non-BOS token +std::string llama_detokenize_spm( + llama_context * ctx, + const std::vector & tokens); + +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +std::string llama_detokenize_bpe( + llama_context * ctx, + const std::vector & tokens); diff --git a/convert.py b/convert.py index 10276bf63..a15e6ccd2 100755 --- a/convert.py +++ b/convert.py @@ -3,6 +3,7 @@ import gguf import argparse import concurrent.futures +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor import copy import enum import faulthandler @@ -17,13 +18,14 @@ import re import signal import struct import sys +import time import zipfile import numpy as np from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union) from sentencepiece import SentencePieceProcessor # type: ignore if TYPE_CHECKING: @@ -37,30 +39,70 @@ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' ARCH=gguf.MODEL_ARCH.LLAMA NAMES=gguf.MODEL_TENSOR_NAMES[ARCH] +DEFAULT_CONCURRENCY = 8 # # data types # @dataclass(frozen=True) -class UnquantizedDataType: +class DataType: name: str + dtype: 'np.dtype[Any]' + valid_conversions: List[str] -DT_F16 = UnquantizedDataType('F16') -DT_F32 = UnquantizedDataType('F32') -DT_I32 = UnquantizedDataType('I32') -DT_BF16 = UnquantizedDataType('BF16') + def elements_to_bytes(self, n_elements: int) -> int: + return n_elements * self.dtype.itemsize -DataType = Union[UnquantizedDataType] +@dataclass(frozen=True) +class UnquantizedDataType(DataType): + pass -DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = { - DT_BF16: np.dtype(np.uint16), - DT_F16: np.dtype(np.float16), - DT_F32: np.dtype(np.float32), - DT_I32: np.dtype(np.int32), -} 
+DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) +DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) +DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) +DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) -NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \ - {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()} +@dataclass(frozen=True) +class QuantizedDataType(DataType): + block_size: int + quantized_dtype: 'np.dtype[Any]' + ggml_type: gguf.GGMLQuantizationType + + def quantize(self, arr: NDArray) -> NDArray: + raise NotImplementedError(f'Quantization for {self.name} not implemented') + + def elements_to_bytes(self, n_elements: int) -> int: + assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' + return self.quantized_dtype.itemsize * (n_elements // self.block_size) + +@dataclass(frozen=True) +class Q8_0QuantizedDataType(QuantizedDataType): + # Mini Q8_0 quantization in Python! + def quantize(self, arr: NDArray) -> NDArray: + assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}' + assert arr.dtype == np.float32, f'Bad array type {arr.dtype}' + n_blocks = arr.size // self.block_size + blocks = arr.reshape((n_blocks, self.block_size)) + # Much faster implementation of block quantization contributed by @Cebtenzzre + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]: + d = abs(blocks).max(axis = 1) / np.float32(127) + with np.errstate(divide = 'ignore'): + qs = (blocks / d[:, None]).round() + qs[d == 0] = 0 + yield from zip(d, qs) + return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) + +DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', + dtype = np.dtype(np.float32), valid_conversions = [], + ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, + quantized_dtype = np.dtype([('d', ' DataType: - if len(tensor.shape) == 1: - # 1D tensors are always F32. - return DT_F32 - elif self == GGMLFileType.AllF32: - return DT_F32 - elif self == GGMLFileType.MostlyF16: - return DT_F16 - else: + dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self) + if dt is None: raise ValueError(self) + # 1D tensors are always F32. 
+ return dt if len(tensor.shape) > 1 else DT_F32 +GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = { + GGMLFileType.AllF32 : DT_F32, + GGMLFileType.MostlyF16 : DT_F16, + GGMLFileType.MostlyQ8_0: DT_Q8_0, +} # # hparams loading @@ -105,6 +149,7 @@ class Params: f_norm_eps: float f_rope_freq_base: Optional[float] = None + f_rope_scale: Optional[float] = None ftype: Optional[GGMLFileType] = None @@ -160,13 +205,20 @@ class Params: def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': config = json.load(open(config_path)) - n_vocab = config["vocab_size"] - n_embd = config["hidden_size"] - n_layer = config["num_hidden_layers"] - n_ff = config["intermediate_size"] - n_head = config["num_attention_heads"] - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head - f_norm_eps = config["rms_norm_eps"] + n_vocab = config["vocab_size"] + n_embd = config["hidden_size"] + n_layer = config["num_hidden_layers"] + n_ff = config["intermediate_size"] + n_head = config["num_attention_heads"] + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head + f_norm_eps = config["rms_norm_eps"] + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None + + rope_scaling = config.get("rope_scaling") + if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": + f_rope_scale = config["rope_scaling"].get("factor") + else: + f_rope_scale = None n_mult = Params.find_n_mult(n_ff, n_embd) @@ -179,15 +231,17 @@ class Params: "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_layer = n_layer, - n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_layer = n_layer, + n_ctx = n_ctx, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head_kv, + f_norm_eps = f_norm_eps, + f_rope_freq_base = f_rope_freq_base, + f_rope_scale = f_rope_scale, ) # LLaMA v2 70B params.json @@ -405,7 +459,7 @@ class UnquantizedTensor(Tensor): self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] def astype(self, data_type: DataType) -> Tensor: - dtype = DATA_TYPE_TO_NUMPY[data_type] + dtype = data_type.dtype if self.data_type == DT_BF16: self.ndarray = bf16_to_fp32(self.ndarray) return UnquantizedTensor(self.ndarray.astype(dtype)) @@ -444,22 +498,6 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv GGMLCompatibleTensor = Union[UnquantizedTensor] -class DeferredPermutedTensor(Tensor): - def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None: - self.base = base - self.n_head = n_head - self.data_type = self.base.data_type - - def astype(self, data_type: DataType) -> Tensor: - return self.base.astype(data_type).permute(self.n_head, self.n_head_kv) - - def to_ggml(self) -> GGMLCompatibleTensor: - return self.base.to_ggml().permute(self.n_head, self.n_head_kv) - - def permute(self, n_head: int, n_head_kv: int) -> Tensor: - raise Exception("shouldn't permute twice") - - @dataclass class LazyTensor: _load: Callable[[], Tensor] @@ -469,7 +507,9 @@ class LazyTensor: def load(self) -> Tensor: ret = self._load() - assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) + # Should be okay if it maps to the same numpy type? 
+ assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ + (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> 'LazyTensor': @@ -480,8 +520,8 @@ class LazyTensor: return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: - if data_type == self.data_type: - return + if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: + raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') LazyModel = Dict[str, LazyTensor] @@ -607,9 +647,7 @@ class LazyUnpickler(pickle.Unpickler): info = self.zip_file.getinfo(filename) def load(offset: int, elm_count: int) -> NDArray: - dtype = DATA_TYPE_TO_NUMPY.get(data_type) - if dtype is None: - raise Exception("tensor stored in unsupported format") + dtype = data_type.dtype fp = self.zip_file.open(info) fp.seek(offset * dtype.itemsize) size = elm_count * dtype.itemsize @@ -673,7 +711,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: def convert(info: Dict[str, Any]) -> LazyTensor: data_type = SAFETENSORS_DATA_TYPES[info['dtype']] - numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + numpy_dtype = data_type.dtype shape: List[int] = info['shape'] begin, end = info['data_offsets'] assert 0 <= begin <= end <= len(byte_buf) @@ -713,23 +751,35 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one output value buffered per thread.''' - with concurrent.futures.ThreadPoolExecutor() as executor: + if concurrency < 2: + yield from map(func, iterable) + # Not reached. 
+ iterable = iter(iterable) + with factory(max_workers = max_workers) as executor: futures: List[concurrent.futures.Future[Out]] = [] - items_rev = list(iterable)[::-1] - for i in range(min(concurrency, len(items_rev))): - futures.append(executor.submit(func, items_rev.pop())) + done = False + for _ in range(concurrency): + try: + futures.append(executor.submit(func, next(iterable))) + except StopIteration: + done = True + break + while futures: result = futures.pop(0).result() - if items_rev: - futures.append(executor.submit(func, items_rev.pop())) + while not done and len(futures) < concurrency: + try: + futures.append(executor.submit(func, next(iterable))) + except StopIteration: + done = True + break yield result - def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) @@ -771,6 +821,9 @@ class OutputFile: if params.f_rope_freq_base: self.gguf.add_rope_freq_base(params.f_rope_freq_base) + if params.f_rope_scale: + self.gguf.add_rope_scale_linear(params.f_rope_scale) + if params.ftype: self.gguf.add_file_type(params.ftype) @@ -791,12 +844,11 @@ class OutputFile: self.gguf.add_token_types(toktypes) def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: - n_elements = 1 - for dim in tensor.shape: - n_elements *= dim - data_type = DATA_TYPE_TO_NUMPY[tensor.data_type] - data_nbytes = n_elements * data_type.itemsize - self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes) + n_elements = int(np.prod(tensor.shape)) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype) def write_meta(self) -> None: self.gguf.write_header_to_file() @@ -822,7 +874,20 @@ class OutputFile: of.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]: + name, lazy_tensor = item + tensor = lazy_tensor.load().to_ggml() + return (lazy_tensor.data_type, tensor.ndarray) + + @staticmethod + def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray: + dt, arr = item + if not isinstance(dt, QuantizedDataType): + return arr + return dt.quantize(arr) + + @staticmethod + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) @@ -838,16 +903,19 @@ class OutputFile: of.write_meta() of.write_tensor_info() - def do_item(item: Tuple[str, LazyTensor]) -> NDArray: - name, lazy_tensor = item - return lazy_tensor.load().to_ggml().ndarray - # tensor data - ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start size = ' x '.join(f"{dim:6d}" for 
dim in lazy_tensor.shape) padi = len(str(len(model))) - print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}") + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}") of.gguf.write_tensor_data(ndarray) of.close() @@ -859,6 +927,8 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi return GGMLFileType.AllF32 if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): return GGMLFileType.MostlyF16 + if output_type_str == "q8_0": + return GGMLFileType.MostlyQ8_0 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} @@ -905,7 +975,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: print(f"skipping tensor {name_new}") continue else: - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}") + print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") out[name_new] = lazy_tensor return out @@ -1010,6 +1080,7 @@ def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ8_0:"q8_0", }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: @@ -1033,12 +1104,13 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) args = parser.parse_args(args_in) if args.dump_single: @@ -1060,6 +1132,7 @@ def main(args_in: Optional[List[str]] = None) -> None: params.ftype = { "f32": GGMLFileType.AllF32, "f16": GGMLFileType.MostlyF16, + "q8_0": GGMLFileType.MostlyQ8_0, }[args.outtype] print(f"params = {params}") @@ -1091,7 +1164,7 @@ def main(args_in: Optional[List[str]] = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, params, model, vocab) + OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency) print(f"Wrote {outfile}") diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d2176c910..94b785224 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -25,6 +25,7 
@@ else() add_subdirectory(simple) add_subdirectory(embd-input) add_subdirectory(llama-bench) + add_subdirectory(beam_search) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/beam_search/CMakeLists.txt b/examples/beam_search/CMakeLists.txt new file mode 100644 index 000000000..b29e01092 --- /dev/null +++ b/examples/beam_search/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET beam_search) +add_executable(${TARGET} beam_search.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp new file mode 100644 index 000000000..42c7c7254 --- /dev/null +++ b/examples/beam_search/beam_search.cpp @@ -0,0 +1,188 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "llama.h" +#include "build-info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + +// Used for debugging to print out beam tokens. +struct ostream_beam_view { + llama_context * ctx; + llama_beam_view beam_view; +}; +std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { + os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; + for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { + os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); + } + return os << ')'; +} + +// Put here anything you want back in beam_search_callback(). +struct beam_search_callback_data { + llama_context * ctx; + std::vector response; +}; + +// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. +// For example, eob can be flagged due to maximum token length, stop words, etc. +bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { + return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); +} + +// Function matching type llama_beam_search_callback_fn_t. +// Custom callback example is called each time the beams lengths increase: +// * Show progress by printing ',' following by number of convergent beam tokens if any. +// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. +// This is also called when the stop condition is met. +// Collect tokens into std::vector response which is pointed to by callback_data. +void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { + auto& callback_data = *static_cast(callback_data_ptr); + // Mark beams as EOS as needed. 
+ for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { + llama_beam_view& beam_view = beams_state.beam_views[i]; + if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { + beam_view.eob = true; + } + } + printf(","); // Show progress + if (const size_t n = beams_state.common_prefix_length) { + callback_data.response.resize(callback_data.response.size() + n); + assert(0u < beams_state.n_beams); + const llama_token * tokens = beams_state.beam_views[0].tokens; + std::copy(tokens, tokens + n, callback_data.response.end() - n); + printf("%lu", n); + } + fflush(stdout); +#if 1 // DEBUG: print current beams for this iteration + std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; + for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { + std::cout << "beams["< 3 ) + { + params.prompt = argv[3]; + } + + if ( params.prompt.empty() ) + { + params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; + } + + //--------------------------------- + // Init LLM : + //--------------------------------- + + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + + std::tie(model, ctx) = llama_init_from_gpt_params( params ); + + if ( model == NULL ) + { + fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); + return 1; + } + + //--------------------------------- + // Tokenize the prompt : + //--------------------------------- + + std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); + + const size_t max_context_size = llama_n_ctx( ctx ); + const size_t max_tokens_list_size = max_context_size - 4 ; + + if (tokens_list.size() > max_tokens_list_size) + { + fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , + __func__ , tokens_list.size() , max_tokens_list_size ); + return 1; + } + + fprintf( stderr, "\n\n" ); + + // Print the tokens from the prompt : + + for( auto id : tokens_list ) + { + std::cout << llama_token_to_piece(ctx, id); + } + std::cout << std::flush; + + int n_past = llama_get_kv_cache_token_count(ctx); + if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) + { + fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); + return 1; + } + n_past += tokens_list.size(); + + beam_search_callback_data callback_data{ctx, {}}; + size_t const beam_width = static_cast(params.n_beams); + int const n_predict = 256; + llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads); + + std::cout << "\n\n"; + for (llama_token const token_id : callback_data.response) { + std::cout << llama_token_to_piece(ctx,token_id); + } + std::cout << std::endl; + + llama_free( ctx ); + llama_free_model( model ); + + llama_backend_free(); + + return 0; +} diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index fd561fcbc..0f37d295b 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -12,18 +12,14 @@ usage: ./convert-llama2c-to-ggml [options] options: -h, --help show this help message and exit - --copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin') + --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model --llama2c-output-model FNAME model path to save the converted llama2.c 
model (default ak_llama_model.bin')
 ```
 
 An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
 
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
-
-For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
-
-`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
 
 Now you can use the model with a command like:
 
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index f8a58dc7a..51d90ea6a 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -10,9 +10,48 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
+// GGUF keys & tensor names.
+
+#define KV_GENERAL_ARCHITECTURE "general.architecture"
+#define KV_GENERAL_NAME "general.name"
+
+#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
+#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
+#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
+#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
+#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
+#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
+#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
+#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
+#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
+#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
+
+#define KV_CONTEXT_LENGTH "llama.context_length"
+#define KV_EMBEDDING_LENGTH "llama.embedding_length"
+#define KV_BLOCK_COUNT "llama.block_count"
+#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
+#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
+#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
+#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
+#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
+
+#define TN_TOKEN_EMBD "token_embd.weight"
+#define TN_OUTPUT_NORM "output_norm.weight"
+#define TN_OUTPUT "output.weight"
+#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
+#define TN_ATTN_Q "blk.%d.attn_q.weight"
+#define TN_ATTN_K "blk.%d.attn_k.weight"
+#define TN_ATTN_V "blk.%d.attn_v.weight"
+#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
+#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
+#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
+#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
+#define TN_FFN_UP "blk.%d.ffn_up.weight"
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -20,6 +59,11 @@
 #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
 #define LLAMA_FILE_VERSION_GGJT_V3 3
 
+#define TOKENIZER_NAME "llama"
+#define UNKNOWN_TOKEN_ID 0
+#define BOS_TOKEN_ID 1
+#define EOS_TOKEN_ID 2
+
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
@@ -183,6 +227,7 @@ struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;   // this is provided as user input?
uint32_t n_embd = 4096; + uint32_t n_ff = 11008; uint32_t n_mult = 4; uint32_t n_head = 32; uint32_t n_layer = 32; @@ -214,6 +259,8 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; + std::string name; + my_llama_hparams hparams; struct ggml_tensor * tok_embeddings; @@ -276,18 +323,13 @@ struct train_params { int mem_compute1_gb; }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_rot: %d\n", __func__, params->n_rot); } @@ -299,7 +341,7 @@ void init_model(struct my_llama_model * model) { const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - const uint32_t n_ff = get_n_ff(&hparams); + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; model->train_its = 0; @@ -481,21 +523,6 @@ struct llama_file { return std::string(chars.data(), len); } - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - ~llama_file() { if (fp) { std::fclose(fp); @@ -503,30 +530,6 @@ struct llama_file { } }; -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - bool is_ggml_file(const char *filename) { llama_file file(filename, "rb"); if (file.size < 4) { @@ -536,48 +539,96 @@ bool is_ggml_file(const char *filename) { return magic == GGUF_MAGIC; } +static std::string llama_escape_whitespaces(const std::string& text) { + std::ostringstream out; + for (char c : text) { + if (c == ' ') out << "\xe2\x96\x81"; + else out << c; + } + return out.str(); +} + void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { -#pragma message("TODO: implement reading vocabulary using gguf") -// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary -// if (is_ggml_file(filename)) { -// -// struct llama_context_params llama_params = llama_context_default_params(); -// llama_params.vocab_only = true; -// -// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); -// struct llama_context * lctx = 
llama_new_context_with_model(lmodel, llama_params); -// -// const int n_vocab = llama_n_vocab(lctx); -// vocab->id_to_token.resize(n_vocab); -// for (int i=0; iid_to_token[i].text = llama_token_get_text(lctx, i); -// vocab->id_to_token[i].score = llama_token_get_score(lctx, i); -// vocab->id_to_token[i].type = llama_token_get_type(lctx, i); -// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i); -// } -// llama_free(lctx); -// llama_free_model(lmodel); -// } else - { // assume llama2.c vocabulary - printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); + if (is_ggml_file(filename)) { + struct ggml_context * ctx_data = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + }; + + struct gguf_context * ctx = gguf_init_from_file(filename, params); + GGML_ASSERT(ctx != NULL); + + const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL); + GGML_ASSERT(model_idx >= 0); + std::string tokenizer_name = gguf_get_val_str(ctx, model_idx); + GGML_ASSERT(tokenizer_name == TOKENIZER_NAME); + + const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST); + GGML_ASSERT(token_idx >= 0); + + const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES); + GGML_ASSERT(score_idx >= 0); + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + + const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE); + GGML_ASSERT(toktype_idx >= 0); + const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + + vocab->id_to_token.resize(n_vocab); + + for (uint32_t i = 0; i < n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab->token_to_id[word] = i; + + auto & token_data = vocab->id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores[i]; + token_data.type = (llama_token_type) toktypes[i]; + } + ggml_free(ctx_data); + gguf_free(ctx); + } else { + // assume llama2.c vocabulary + printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (int i=0; i single byte tokens. - char byte_val; - if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { - char cstr[2] = { byte_val, 0 }; - text = cstr; + + unsigned char byte_val; + llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL; + if (id == UNKNOWN_TOKEN_ID) { + text = ""; + type = LLAMA_TOKEN_TYPE_UNKNOWN; + } else if (id == BOS_TOKEN_ID) { + text = ""; + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (id == EOS_TOKEN_ID) { + text = ""; + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (text.empty()) { + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { + // Text of byte tokens is already in the expected format. 
+ type = LLAMA_TOKEN_TYPE_BYTE; + } else { + type = LLAMA_TOKEN_TYPE_NORMAL; } - vocab->id_to_token[i].text = text; - vocab->id_to_token[i].score = score; - vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED; - vocab->token_to_id.emplace(text, i); + text = llama_escape_whitespaces(text); + + vocab->id_to_token[id].text = text; + vocab->id_to_token[id].score = score; + vocab->id_to_token[id].type = type; + vocab->token_to_id.emplace(text, id); } } } @@ -619,33 +670,6 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar } void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - -#pragma message("TODO: implement file saving using gguf") - // write_magic - file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic - file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - - // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_data = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_data.text.size()); - file.write_raw(token_data.text.data(), token_data.text.size()); - file.write_raw(&token_data.score, sizeof(token_data.score)); - } - // stuff AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor @@ -658,8 +682,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod // for rms-att-weight int row_length = model->hparams.n_embd; const auto & hparams = model->hparams; - //int n_ff = model->hparams.n_embd; - int n_ff = get_n_ff(&hparams); + int n_ff = model->hparams.n_ff; for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; @@ -677,28 +700,91 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); } + + struct gguf_context * ctx = gguf_init_empty(); + + std::vector tokens; + std::vector scores; + std::vector token_types; + for (const llama_vocab::token_data & token_data : vocab->id_to_token) { + tokens.push_back(token_data.text.c_str()); + scores.push_back(token_data.score); + token_types.push_back(token_data.type); + } + gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size()); + gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size()); + gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size()); + + gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME); + + gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama"); + gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama"); + + // special tokens + gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); + gguf_set_val_u32(ctx, 
KV_TOKENIZER_PAD_ID, -1); + + gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); + gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); + gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); + // n_head_kv is optional, default to n_head + // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...); + gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); + gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot); + gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); + // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); // ? + ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD); + gguf_add_tensor(ctx, model->tok_embeddings); + + ggml_set_name(model->norm, TN_OUTPUT_NORM); + gguf_add_tensor(ctx, model->norm); + + ggml_set_name(model->output, TN_OUTPUT); + gguf_add_tensor(ctx, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); + ggml_format_name(layer.wq, TN_ATTN_Q, i); + gguf_add_tensor(ctx, layer.wq); + + ggml_format_name(layer.wk, TN_ATTN_K, i); + gguf_add_tensor(ctx, layer.wk); + + ggml_format_name(layer.wv, TN_ATTN_V, i); + gguf_add_tensor(ctx, layer.wv); + + ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i); + gguf_add_tensor(ctx, layer.wo); + + ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i); + gguf_add_tensor(ctx, layer.attention_norm); + + ggml_format_name(layer.w1, TN_FFN_GATE, i); + gguf_add_tensor(ctx, layer.w1); + + ggml_format_name(layer.w2, TN_FFN_DOWN, i); + gguf_add_tensor(ctx, layer.w2); + + ggml_format_name(layer.w3, TN_FFN_UP, i); + gguf_add_tensor(ctx, layer.w3); + + ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i); + gguf_add_tensor(ctx, layer.ffn_norm); } + + gguf_write_to_file(ctx, filename, false); + gguf_free(ctx); } struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "tokenizer.bin"; + params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; @@ -751,7 +837,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, "\n"); @@ -812,6 +898,14 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { return true; } +std::string 
basename(const std::string &path) { + size_t pos = path.find_last_of("/"); + if (pos == std::string::npos) { + return path; + } + return path.substr(pos + 1); +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { @@ -840,6 +934,7 @@ int main(int argc, char ** argv) { model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; model.hparams.n_mult = 32;//params.n_mult; model.hparams.n_head = config.n_heads; //params.n_head; model.hparams.n_layer = config.n_layers; //params.n_layer; @@ -853,6 +948,7 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); init_model(&model); + model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 8a6ad882e..036bdb398 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) { if (id == llama_token_eos(ctx)) { ret = ""; } else { - ret = llama_token_to_str(ctx, id); + ret = llama_token_to_piece(ctx, id); } eval_id(mymodel, id); return ret.c_str(); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38395c75b..93d583b5c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -56,9 +56,6 @@ int main(int argc, char ** argv) { int n_past = 0; - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - // tokenize the prompt auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); @@ -67,7 +64,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "\n"); } diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index dee00df87..cda517bde 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -30,6 +30,9 @@ bool gguf_ex_write(const std::string & fname) { gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678); gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679); gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f); + gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull); + gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll); + gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789); gguf_set_val_bool(ctx, "some.parameter.bool", true); gguf_set_val_str (ctx, "some.parameter.string", "hello world"); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7a2811584..d0fe6d90d 100755 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -441,6 +441,8 @@ struct test { static const std::string gpu_info; std::string model_filename; std::string model_type; + uint64_t 
model_size; + uint64_t model_n_params; int n_batch; int n_threads; bool f32_kv; @@ -457,8 +459,10 @@ struct test { test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { model_filename = inst.model; char buf[128]; - llama_model_type(lmodel, buf, sizeof(buf)); + llama_model_desc(lmodel, buf, sizeof(buf)); model_type = buf; + model_size = llama_model_size(lmodel); + model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; n_threads = inst.n_threads; f32_kv = inst.f32_kv; @@ -524,7 +528,7 @@ struct test { "build_commit", "build_number", "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", - "model_filename", "model_type", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "f16_kv", "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split", "n_prompt", "n_gen", "test_time", @@ -538,6 +542,7 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_threads" || + field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || field == "stddev_ns") { @@ -573,7 +578,7 @@ struct test { build_commit, std::to_string(build_number), std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, - model_filename, model_type, + model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, @@ -709,8 +714,15 @@ struct markdown_printer : public printer { return -30; } if (field == "t/s") { - return 15; + return 16; } + if (field == "size" || field == "params") { + return 10; + } + if (field == "n_gpu_layers") { + return 3; + } + int width = std::max((int)field.length(), 10); if (test::get_field_type(field) == test::STRING) { @@ -719,9 +731,28 @@ struct markdown_printer : public printer { return width; } + static std::string get_field_display_name(const std::string & field) { + if (field == "n_gpu_layers") { + return "ngl"; + } + if (field == "n_threads") { + return "threads"; + } + if (field == "mul_mat_q") { + return "mmq"; + } + if (field == "tensor_split") { + return "ts"; + } + return field; + } + void print_header(const cmd_params & params) override { // select fields to print - fields = { "model", "backend" }; + fields.push_back("model"); + fields.push_back("size"); + fields.push_back("params"); + fields.push_back("backend"); bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; if (!is_cpu_backend) { fields.push_back("n_gpu_layers"); @@ -752,7 +783,7 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { - fprintf(fout, " %*s |", get_field_width(field), field.c_str()); + fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str()); } fprintf(fout, "\n"); fprintf(fout, "|"); @@ -769,12 +800,26 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { std::string value; + char buf[128]; if (field == "model") { value = t.model_type; + } else if (field == "size") { + if (t.model_size < 
1024*1024*1024) { + snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); + } else { + snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); + } + value = buf; + } else if (field == "params") { + if (t.model_n_params < 1000*1000*1000) { + snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); + } else { + snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); + } + value = buf; } else if (field == "backend") { value = test::get_backend(); } else if (field == "test") { - char buf[128]; if (t.n_prompt > 0 && t.n_gen == 0) { snprintf(buf, sizeof(buf), "pp %d", t.n_prompt); } else if (t.n_gen > 0 && t.n_prompt == 0) { @@ -785,7 +830,6 @@ struct markdown_printer : public printer { } value = buf; } else if (field == "t/s") { - char buf[128]; snprintf(buf, sizeof(buf), "%.2f Β± %.2f", t.avg_ts(), t.stdev_ts()); value = buf; } else if (vmap.find(field) != vmap.end()) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cb8747c2b..3ce57f436 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -189,12 +189,14 @@ int main(int argc, char ** argv) { } } - const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + // Add BOS if SPM tokenizer + const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; // tokenize the prompt std::vector embd_inp; + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { - embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm); + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { embd_inp = session_tokens; } @@ -209,10 +211,9 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - params.cfg_negative_prompt.insert(0, 1, ' '); - guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, is_spm); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; } @@ -259,7 +260,7 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); // in instruct mode, we inject a prefix and a suffix to each input by the user @@ -278,7 +279,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { @@ -286,14 +287,14 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str()); + fprintf(stderr, "%6d 
-> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "'\n"); } @@ -449,7 +450,7 @@ int main(int argc, char ** argv) { //printf("\n---\n"); //printf("resetting: '"); //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_str(ctx, embd[i])); + // printf("%s", llama_token_to_piece(ctx, embd[i])); //} //printf("'\n"); //printf("\n---\n"); @@ -502,7 +503,7 @@ int main(int argc, char ** argv) { input_size = embd_guidance.size(); //fprintf(stderr, "\n---------------------\n"); //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); //} //fprintf(stderr, "\n---------------------\n"); } else { @@ -597,7 +598,12 @@ int main(int argc, char ** argv) { last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_repeat, alpha_frequency, alpha_presence); if (!penalize_nl) { - logits[llama_token_nl(ctx)] = nl_logit; + for (size_t idx = 0; idx < candidates_p.size; idx++) { + if (candidates_p.data[idx].id == llama_token_nl(ctx)) { + candidates_p.data[idx].logit = nl_logit; + break; + } + } } if (grammar != NULL) { @@ -661,7 +667,7 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - printf("%s", llama_token_to_str(ctx, id).c_str()); + printf("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stdout); } @@ -677,7 +683,7 @@ int main(int argc, char ** argv) { if (params.antiprompt.size()) { std::string last_output; for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); + last_output += llama_token_to_piece(ctx, id); } is_antiprompt = false; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index a7bd9db2a..ebafa0c29 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -27,6 +29,40 @@ std::vector softmax(const std::vector& logits) { return probs; } +float log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit); + return logits[tok] - max_logit - log(sum_exp); +} + +void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector& workers, + double& nll, double& nll2) { + + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () { + double local_nll = 0, local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + local_nll += v; + local_nll2 += v*v; + } + }; + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); + +} + void perplexity_v2(llama_context * ctx, const gpt_params & params) { // 
Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` @@ -154,10 +190,14 @@ void perplexity(llama_context * ctx, const gpt_params & params) { const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; const bool add_bos = is_spm; + auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + auto tim2 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + const int n_chunk_max = tokens.size() / params.n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); @@ -166,9 +206,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) { int count = 0; double nll = 0.0; + double nll2 = 0.0; fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + std::vector workers(std::thread::hardware_concurrency() - 1); + for (int i = 0; i < n_chunk; ++i) { const int start = i * params.n_ctx; const int end = start + params.n_ctx; @@ -228,26 +271,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // Example, we have a context window of 512, we will compute perplexity for each of the // last 256 tokens. Then, we split the input up into context window size chunks to // process the entire prompt. - for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); + const int first = std::min(512, params.n_ctx/2); + process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2); + count += params.n_ctx - first - 1; - const float prob = softmax(tok_logits)[tokens[start + j + 1]]; - - nll += -std::log(prob); - ++count; - } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); } else { - printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count)); + double av = nll/count; + double av2 = nll2/count - av*av; + if (av2 > 0) av2 = sqrt(av2/(count-1)); + printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2); } fflush(stdout); } printf("\n"); + nll2 /= count; + nll /= count; + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + double ppl = exp(nll); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } } std::vector hellaswag_evaluate_tokens(llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, @@ -306,6 +355,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + fprintf(stderr, "================================= is_spm = %d\n", is_spm); // This is needed as usual for LLaMA models const bool add_bos = is_spm; @@ -346,7 +396,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_data[i].context = prompt_lines[idx*6]; hs_data[i].gold_ending_idx = std::stoi( 
prompt_lines[idx*6+1] ); for (size_t j=0; j < 4; j++) { - hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; + hs_data[i].ending[j] = prompt_lines[idx*6+2+j]; } // Delete the selected random example from the prompt @@ -361,6 +411,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { double acc = 0.0f; const int n_vocab = llama_n_vocab(ctx); + std::vector> ending_tokens(4); + std::vector tok_logits(n_vocab); for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { @@ -368,11 +420,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos); size_t context_size = context_embd.size(); + for (int i = 0; i < 4; ++i) { + ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos); + for (int k = 0; k < int(context_size); ++k) { + if (ending_tokens[i][k] != context_embd[k]) { + fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k); + break; + } + } + } + // Do the 1st ending // In this case we include the context when evaluating - auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos); + //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos); + auto query_embd = ending_tokens[0]; auto query_size = query_embd.size(); - //printf("First query: %d\n",(int)query_size); // Stop if query wont fit the ctx window if (query_size > (size_t)params.n_ctx) { @@ -417,7 +479,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) { // Tokenize the query - query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false); + query_embd.resize(ending_tokens[ending_idx].size() - context_size); + std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int)); query_size = query_embd.size(); // Stop if query wont fit the ctx window diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3db61b754..573bc4ef9 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -87,7 +87,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); - auto next_token_str = llama_token_to_str(ctx, next_token); + auto next_token_str = llama_token_to_piece(ctx, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); - auto next_token_str = llama_token_to_str(ctx2, next_token); + auto next_token_str = llama_token_to_piece(ctx2, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); diff --git a/examples/server/README.md b/examples/server/README.md index 77997f98d..517608046 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed. 
```bash mkdir llama-client cd llama-client -npm init -npm install axios ``` Create a index.js file and put inside this: ```javascript -const axios = require("axios"); - const prompt = `Building a website can be done in 10 simple steps:`; async function Test() { - let result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - n_predict: 512, - }); - - // the response is received until completion finish - console.log(result.data.content); + let response = await fetch("http://127.0.0.1:8080/completion", { + method: 'POST', + body: JSON.stringify({ + prompt, + n_predict: 512, + }) + }) + console.log((await response.json()).content) } -Test(); +Test() ``` And run it: ```bash -node . +node index.js ``` ## API Endpoints @@ -167,6 +164,12 @@ node . Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`. +- **POST** `/detokenize`: Convert tokens to text. + + *Options:* + + `tokens`: Set the tokens to detokenize. + - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. *Options:* diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 025b385cc..89a3311f5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_str(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; } @@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line, // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); + std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) if (out.size() == 1 && (out[0] & 0x80) == 0x80) @@ -286,7 +286,6 @@ struct llama_server_context std::vector p; if (first) { - s.insert(0, 1, ' '); // add a space if it's the first p = ::llama_tokenize(ctx, s, add_bos); first = false; } @@ -309,7 +308,6 @@ struct llama_server_context else { auto s = json_prompt.template get(); - s.insert(0, 1, ' '); // always add a first space prompt_tokens = ::llama_tokenize(ctx, s, add_bos); } @@ -566,7 +564,7 @@ struct llama_server_context if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_str(ctx, embd.back()); + // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; stopped_eos = true; LOG_VERBOSE("eos token found", {}); @@ -613,7 +611,7 @@ struct llama_server_context { const completion_token_output token_with_probs = nextToken(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; if (params.n_probs > 0) @@ -1104,6 +1102,12 @@ static json format_tokenizer_response(const std::vector &tokens) {"tokens", tokens}}; } +static json format_detokenized_response(std::string content) +{ + return json{ + {"content", content}}; +} + template static T json_value(const json &body, const std::string &key, const T &default_value) { @@ -1209,6 +1213,62 @@ static void log_server_request(const Request &req, const Response &res) }); } +bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) { + return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx); +} + +// Function matching type llama_beam_search_callback_fn_t. +// Custom callback example is called each time the beams lengths increase: +// * Show progress by printing ',' following by number of convergent beam tokens if any. +// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. +// This is also called when the stop condition is met. +// Collect tokens into std::vector response which is pointed to by callback_data. +void beam_search_callback(void * callback_data, llama_beams_state beams_state) { + auto & llama = *static_cast(callback_data); + // Mark beams as EOS as needed. + for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { + llama_beam_view& beam_view = beams_state.beam_views[i]; + if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) { + beam_view.eob = true; + } + } + printf(","); // Show progress + if (const size_t n = beams_state.common_prefix_length) { + llama.generated_token_probs.resize(llama.generated_token_probs.size() + n); + assert(0u < beams_state.n_beams); + const llama_token * tokens = beams_state.beam_views[0].tokens; + const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; + std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); + printf("%lu", n); + } + fflush(stdout); +#if 0 // DEBUG: print current beams for this iteration + std::cout << "\n\nCurrent beams:\n"; + for (size_t i=0 ; i < beams_state.n_beams ; ++i) { + std::cout << "beams["< 0) { continue; } - const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); size_t pos = std::min(sent_count, llama.generated_text.size()); @@ -1437,6 +1505,21 @@ int main(int argc, char **argv) const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); + svr.Post("/detokenize", [&llama](const Request &req, Response &res) + { + auto lock = llama.lock(); + + const json body = json::parse(req.body); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; + content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + } + + const json data = format_detokenized_response(content); + return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const Request &req, Response &res) { auto lock = llama.lock(); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 132f7fbf9..4ee85faca 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -63,7 +63,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str()); + fprintf(stderr, 
"%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -112,7 +112,7 @@ int main(int argc, char ** argv) { } // print the new token : - printf("%s", llama_token_to_str(ctx, new_token_id).c_str()); + printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // push this new token for next evaluation diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 79b117df7..12d153417 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) { void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_str(ctx, token).c_str()); + printf("%s", llama_token_to_piece(ctx, token).c_str()); } void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { @@ -2202,7 +2202,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto const char * in = buf.data(); const char * end = buf.data() + buf.size(); for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_str(lctx, out[i]); + std::string s = llama_token_to_piece(lctx, out[i]); int len = s.length(); if (in >= end) { printf("%s: unexpected end of original text.\n", __func__); diff --git a/flake.lock b/flake.lock index 33164e096..a7777d05d 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1685518550, - "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=", + "lastModified": 1692799911, + "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=", "owner": "numtide", "repo": "flake-utils", - "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef", + "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1685931219, - "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=", + "lastModified": 1692913444, + "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "7409480d5c8584a1a83c422530419efe4afb0d19", + "rev": "18324978d632ffc55ef1d928e81630c620f4f447", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 616b90252..02095411e 100644 --- a/flake.nix +++ b/flake.nix @@ -6,6 +6,9 @@ outputs = { self, nixpkgs, flake-utils }: flake-utils.lib.eachDefaultSystem (system: let + name = "llama.cpp"; + src = ./.; + meta.mainProgram = "llama"; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; buildInputs = with pkgs; [ openmpi ]; osSpecific = with pkgs; buildInputs ++ @@ -21,11 +24,17 @@ CoreGraphics CoreVideo ] + else if isDarwin then + with pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] else with pkgs; [ openblas ] ); pkgs = import nixpkgs { inherit system; }; - nativeBuildInputs = with pkgs; [ cmake pkgconfig ]; + nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ]; llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); postPatch = '' @@ -38,35 +47,35 @@ mv $out/bin/server $out/bin/llama-server ''; cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; - in { + in + { packages.default = pkgs.stdenv.mkDerivation { - name = "llama.cpp"; - src = ./.; - postPatch = postPatch; - nativeBuildInputs = nativeBuildInputs; - buildInputs = 
osSpecific; + inherit name src meta postPatch nativeBuildInputs buildInputs postInstall; cmakeFlags = cmakeFlags ++ (if isAarch64 && isDarwin then [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" - ] else [ - "-DLLAMA_BLAS=ON" - "-DLLAMA_BLAS_VENDOR=OpenBLAS" + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + "-DLLAMA_METAL=ON" + ] else [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=OpenBLAS" ]); - postInstall = postInstall; - meta.mainProgram = "llama"; }; packages.opencl = pkgs.stdenv.mkDerivation { - name = "llama.cpp"; - src = ./.; - postPatch = postPatch; - nativeBuildInputs = nativeBuildInputs; + inherit name src meta postPatch nativeBuildInputs postInstall; buildInputs = with pkgs; buildInputs ++ [ clblast ]; cmakeFlags = cmakeFlags ++ [ "-DLLAMA_CLBLAST=ON" ]; - postInstall = postInstall; - meta.mainProgram = "llama"; + }; + packages.rocm = pkgs.stdenv.mkDerivation { + inherit name src meta postPatch nativeBuildInputs postInstall; + buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; + cmakeFlags = cmakeFlags ++ [ + "-DLLAMA_HIPBLAS=1" + "-DCMAKE_C_COMPILER=hipcc" + "-DCMAKE_CXX_COMPILER=hipcc" + "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" + ]; }; apps.llama-server = { type = "app"; @@ -80,8 +89,13 @@ type = "app"; program = "${self.packages.${system}.default}/bin/llama"; }; + apps.quantize = { + type = "app"; + program = "${self.packages.${system}.default}/bin/quantize"; + }; apps.default = self.apps.${system}.llama; devShells.default = pkgs.mkShell { + buildInputs = [ llama-python ]; packages = nativeBuildInputs ++ osSpecific; }; }); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 83d53c13c..d76a25dc2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -306,11 +306,11 @@ typedef struct { #define QI4_K (QK_K / (4*QR4_K)) #ifdef GGML_QKK_64 typedef struct { - half d[2]; // super-block scales/mins + half dm[2]; // super-block scales/mins uint8_t scales[2]; // 4-bit block scales/mins uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); #else typedef struct { half2 dm; // super-block scale for quantized scales/mins @@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float const int tid = threadIdx.x; const uint8_t * q = x[i].qs; float * y = yy + i*QK_K; - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); #endif @@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const uint16_t * a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; float sum = 0.f; for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) @@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; - const float dall = bq4_K->d[0]; - const float dmin = bq4_K->d[1]; + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; const 
float d8_1 = __low2float(bq8_1[0].ds); const float d8_2 = __low2float(bq8_1[1].ds); @@ -2929,7 +2929,11 @@ template static __device__ __forceinlin const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; +#if QK_K == 256 x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif } #pragma unroll @@ -3119,7 +3123,9 @@ template static __device__ __forceinlin const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; +#if QK_K == 256 x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif } #pragma unroll @@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { +#if QK_K == 256 + int id; CUDA_CHECK(cudaGetDevice(&id)); const int compute_capability = g_compute_capabilities[id]; @@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( mul_mat_q3_K<<>> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } +#endif } static void ggml_mul_mat_q4_K_q8_1_cuda( @@ -6328,9 +6337,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented const int mode = ((int32_t *) dst->op_params)[2]; const bool is_glm = mode & 4; + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm } diff --git a/ggml.c b/ggml.c index 8cb5c404f..855d519bf 100644 --- a/ggml.c +++ b/ggml.c @@ -19394,7 +19394,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// struct gguf_str { - uint32_t n; + uint64_t n; // GGUFv2 char * data; }; @@ -19408,9 +19408,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { [GGUF_TYPE_FLOAT32] = sizeof(float), [GGUF_TYPE_BOOL] = sizeof(bool), [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_TYPE_INT64] = sizeof(int64_t), + [GGUF_TYPE_FLOAT64] = sizeof(double), [GGUF_TYPE_ARRAY] = 0, // undefined }; -static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { [GGUF_TYPE_UINT8] = "u8", @@ -19423,8 +19426,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { [GGUF_TYPE_BOOL] = "bool", [GGUF_TYPE_STRING] = "str", [GGUF_TYPE_ARRAY] = "arr", + [GGUF_TYPE_UINT64] = "u64", + [GGUF_TYPE_INT64] = "i64", + [GGUF_TYPE_FLOAT64] = "f64", }; -static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); union gguf_value { uint8_t uint8; @@ -19434,6 +19440,9 @@ union gguf_value { uint32_t uint32; int32_t int32; float float32; + uint64_t uint64; + int64_t int64; + double float64; bool bool_; struct gguf_str str; @@ -19441,7 +19450,7 @@ union gguf_value { struct { enum gguf_type type; - uint32_t n; + uint64_t n; // GGUFv2 void * data; } arr; }; @@ -19449,8 +19458,6 @@ union gguf_value { struct gguf_kv { struct gguf_str key; - uint32_t n_bytes; // TODO: is this actually needed? 
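
The GGUFv2 changes in this region widen the string/array/tensor counts to 64 bits and add the `UINT64`, `INT64`, and `FLOAT64` value types, together with the matching `gguf_set_val_*` / `gguf_get_val_*` accessors added later in this patch. A minimal round-trip sketch of one of the new 64-bit key/value pairs is shown below. This is illustrative only and not part of the patch; it assumes `gguf_find_key` and the layout of `struct gguf_init_params` from `ggml.h`, which are not visible in this diff.

```cpp
// Sketch: write a GGUFv2 file containing a single 64-bit KV pair, then read it back.
// Assumes ggml.h from this revision; "example.gguf" and the key name are made up.
#include "ggml.h"
#include <cinttypes>
#include <cstdio>

int main() {
    // --- write ---
    struct gguf_context * ctx = gguf_init_empty();
    gguf_set_val_u64(ctx, "example.n_tokens_total", 123456789012345ull); // requires GGUFv2
    gguf_write_to_file(ctx, "example.gguf", /*only_meta=*/false);        // no tensors in this example
    gguf_free(ctx);

    // --- read back (metadata only, no ggml context needed) ---
    struct gguf_init_params params = { /*.no_alloc =*/ false, /*.ctx =*/ NULL };
    struct gguf_context * ctx2 = gguf_init_from_file("example.gguf", params);
    const int idx = gguf_find_key(ctx2, "example.n_tokens_total");
    if (idx >= 0) {
        printf("n_tokens_total = %" PRIu64 "\n", gguf_get_val_u64(ctx2, idx));
    }
    gguf_free(ctx2);
    return 0;
}
```
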
- enum gguf_type type; union gguf_value value; }; @@ -19458,15 +19465,15 @@ struct gguf_kv { struct gguf_header { uint32_t magic; uint32_t version; - uint32_t n_tensors; - uint32_t n_kv; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 }; struct gguf_tensor_info { struct gguf_str name; uint32_t n_dims; - uint32_t ne[GGML_MAX_DIMS]; + uint64_t ne[GGML_MAX_DIMS]; enum ggml_type type; @@ -19497,19 +19504,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) return n == size; } -static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; bool ok = true; - // TODO: how to avoid mallocs for strings? ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } +static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + struct gguf_context * gguf_init_empty(void) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); @@ -19565,8 +19585,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ctx->data = NULL; ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + uint32_t n_kv = 0; + + ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); @@ -19576,6 +19609,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; + if (ctx->header.version == 1) { + gguf_fread_str = gguf_fread_str_v1; + } + // read the kv pairs { ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); @@ -19585,9 +19624,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p //fprintf(stderr, "%s: reading kv %d\n", __func__, i); - ok = ok && gguf_fread_str(file, &kv->key, &offset); - //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset); - ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + ok = ok && gguf_fread_str(file, &kv->key, &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); @@ -19599,12 +19637,23 @@ struct gguf_context * gguf_init_from_file(const 
char * fname, struct gguf_init_p case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; case GGUF_TYPE_ARRAY: { ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } switch (kv->value.arr.type) { case GGUF_TYPE_UINT8: @@ -19614,6 +19663,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: case GGUF_TYPE_BOOL: { kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); @@ -19660,7 +19712,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_str(file, &info->name, &offset); ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } } ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); @@ -19954,6 +20013,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) { return ctx->kv[i].value.float32; } +uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint64; +} + +int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int64; +} + +double gguf_get_val_f64(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float64; +} + bool gguf_get_val_bool(struct gguf_context * ctx, int i) { return ctx->kv[i].value.bool_; } @@ -20056,6 +20127,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { ctx->kv[idx].value.float32 = val; } +void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void 
gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { const int idx = gguf_get_or_add_key(ctx, key); @@ -20106,6 +20198,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; case GGUF_TYPE_ARRAY: @@ -20267,6 +20362,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; case GGUF_TYPE_ARRAY: @@ -20282,6 +20380,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: case GGUF_TYPE_BOOL: { gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); @@ -20516,6 +20617,14 @@ int ggml_cpu_has_sse3(void) { #endif } +int ggml_cpu_has_ssse3(void) { +#if defined(__SSSE3__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_vsx(void) { #if defined(__POWER9_VECTOR__) return 1; diff --git a/ggml.h b/ggml.h index 421c0df60..792ca6e42 100644 --- a/ggml.h +++ b/ggml.h @@ -216,7 +216,7 @@ #define GGML_EXIT_ABORTED 1 #define GGUF_MAGIC 0x46554747 // "GGUF" -#define GGUF_VERSION 1 +#define GGUF_VERSION 2 #define GGUF_DEFAULT_ALIGNMENT 32 @@ -1827,6 +1827,9 @@ extern "C" { GGUF_TYPE_BOOL = 7, GGUF_TYPE_STRING = 8, GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, GGUF_TYPE_COUNT, // marks the end of the enum }; @@ -1867,6 +1870,9 @@ extern "C" { GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i); GGML_API 
int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i); GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i); + GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i); + GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i); + GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i); GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i); GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i); GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i); @@ -1886,6 +1892,9 @@ extern "C" { GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); @@ -1944,6 +1953,7 @@ extern "C" { GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_vsx (void); // diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index f4db7001b..838a2c0f8 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -13,7 +13,7 @@ from typing import Any, IO, List, Optional # GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 1 +GGUF_VERSION = 2 GGUF_DEFAULT_ALIGNMENT = 32 # general @@ -365,6 +365,9 @@ class GGUFValueType(IntEnum): BOOL = 7 STRING = 8 ARRAY = 9 + UINT64 = 10 + INT64 = 11 + FLOAT64 = 12 @staticmethod def get_type(val): @@ -378,6 +381,7 @@ class GGUFValueType(IntEnum): return GGUFValueType.BOOL elif isinstance(val, int): return GGUFValueType.INT32 + # TODO: need help with 64-bit types in Python else: print("Unknown type: "+str(type(val))) sys.exit() @@ -400,8 +404,8 @@ class GGUFWriter: def write_header_to_file(self): self.fout.write(struct.pack(" -#include -#include #endif #include "llama.h" @@ -62,6 +59,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -114,12 +114,17 @@ static size_t utf8_len(char src) { } void replace_all(std::string & s, const std::string & search, const std::string & replace) { - for (size_t pos = 0; ; pos += replace.length()) { - pos = s.find(search, pos); - if (pos == std::string::npos) break; - s.erase(pos, search.length()); - s.insert(pos, replace); + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; } + s = std::move(result); } static void zeros(std::ofstream & file, size_t n) { @@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } -static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) { +static std::string 
llama_token_to_str(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -955,10 +960,10 @@ struct llama_vocab { id linefeed_id = 13; int find_bpe_rank(std::string token_left, std::string token_right) const { - replace_all(token_left, " ", "Δ "); - replace_all(token_left, "\n", "Ċ"); - replace_all(token_right, " ", "Δ "); - replace_all(token_right, "\n", "Ċ"); + replace_all(token_left, " ", "\u0120"); + replace_all(token_left, "\n", "\u010A"); + replace_all(token_right, " ", "\u0120"); + replace_all(token_right, "\n", "\u010A"); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1144,11 +1149,13 @@ static bool llama_kv_cache_init( enum llama_fver { GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, }; static const char * llama_file_version_name(llama_fver version) { switch (version) { - case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)"; + case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; + case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)"; } return "unknown"; @@ -1635,7 +1642,8 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( llama_model_loader & ml, @@ -1737,7 +1745,11 @@ static void llm_load_vocab( } // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' - vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0]; + if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } else { + vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + } // special tokens GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); @@ -2635,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon( const size_t wsize = ggml_type_size(cur->type); - struct ggml_tensor * tmpq = ggml_view_3d( + // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for + // non-contiguous views is added for the rope operator + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head, N, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); + 0)); offload_func_kq(tmpq); - struct ggml_tensor * tmpk = ggml_view_3d( + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, N, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); + wsize * n_embd_head * n_head)); offload_func_kq(tmpk); struct ggml_tensor * tmpv = ggml_view_3d( @@ -2831,7 +2845,6 @@ static bool llama_eval_internal( GGML_ASSERT(n_tokens > 0); GGML_ASSERT(n_past >= 0); - GGML_ASSERT(n_threads > 0); // TODO: keep the values of n_batch and n_ctx // 
GGML_ASSERT(n_tokens <= n_batch); // GGML_ASSERT(n_past + n_tokens <= n_ctx); @@ -2842,6 +2855,8 @@ static bool llama_eval_internal( ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif + GGML_ASSERT(n_threads > 0); + const int N = n_tokens; const auto & model = lctx.model; @@ -3026,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(buf); } -static std::string llama_escape_whitespace(const std::string& text) { - std::string result = "\xe2\x96\x81"; - for (size_t offs = 0; offs < text.length(); ++offs) { - if (text[offs] == ' ') { - result += "\xe2\x96\x81"; - } else { - result += text[offs]; - } - } - return result; +static void llama_escape_whitespace(std::string & text) { + replace_all(text, " ", "\xe2\x96\x81"); } static void llama_unescape_whitespace(std::string & word) { @@ -3219,7 +3226,7 @@ struct llm_bigram_bpe { }; struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; } + llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; @@ -3371,8 +3378,6 @@ private: return words; } - bool flag_g2ws = false; - const llama_vocab & vocab; std::vector symbols; @@ -3381,9 +3386,18 @@ private: llm_bigram_bpe::queue work_queue; }; -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) { +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { std::vector output; + // OG tokenizer behavior: + // + // tokenizer.encode('', add_bos=True) returns [1] + // tokenizer.encode('', add_bos=False) returns [] + + if (bos && vocab.special_bos_id != -1) { + output.push_back(vocab.special_bos_id); + } + if (raw_text.empty()) { return output; } @@ -3391,29 +3405,16 @@ static std::vector llama_tokenize_internal(const llama_vocab & switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { + // without adding this leading whitespace, we do not get the same results as the original tokenizer + raw_text = " " + raw_text; + llm_tokenizer_spm tokenizer(vocab); - - if (bos) { - output.push_back(vocab.special_bos_id); - } - - std::string text; - if (escape) { - text = llama_escape_whitespace(raw_text); - } else { - text = raw_text; - } - - tokenizer.tokenize(text, output); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab, escape); - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - + llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); } break; }; @@ -3908,7 +3909,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * // Calculate absolute value of second derivatives for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = abs(second_derivatives[i]); + second_derivatives[i] = std::abs(second_derivatives[i]); } // Normalize the second derivatives @@ -4099,16 +4100,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - const std::string text = llama_token_to_text(ctx, id); + const llama_token id = candidates->data[i].id; + const std::string piece = llama_token_to_str(ctx, id); if (id == eos) 
{ if (!allow_eos) { candidates->data[i].logit = -INFINITY; } - } else if (text.empty() || text[0] == 0) { + } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } @@ -4312,10 +4313,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string text = llama_token_to_text(ctx, token); + const std::string piece = llama_token_to_str(ctx, token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8); + const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); const auto & code_points = decoded.first; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); @@ -4326,6 +4327,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } +// +// Beam search +// + +struct llama_beam { + std::vector tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Initialize end-of-beam to false. Callback sets this to true. + // Sort beams by probability. In case of ties, prefer beams at eob. + bool operator<(const llama_beam & rhs) const { + return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob); + } + // Shift off first n tokens and discard them. + void shift_tokens(const size_t n) { + if (n) { + std::copy(tokens.begin() + n, tokens.end(), tokens.begin()); + tokens.resize(tokens.size() - n); + } + } + llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; } +}; + +// A struct for calculating logit-related info. +struct llama_logit_info { + const float * const logits; + const int n_vocab; + const float max_l; + const float normalizer; + struct sum_exp { + float max_l; + float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } + }; + llama_logit_info(llama_context * ctx) + : logits(llama_get_logits(ctx)) + , n_vocab(llama_n_vocab(ctx)) + , max_l(*std::max_element(logits, logits + n_vocab)) + , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) + { } + llama_token_data get_token_data(const llama_token token_id) const { + constexpr auto p = std::numeric_limits::quiet_NaN(); // never used + return {token_id, logits[token_id], p}; + } + // Return top k token_data by logit. 
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap;  // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id    = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int    n_past;
+    int    n_predict;
+    int    n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetitive patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
+            if (next_beams.size() < n_beams) {
+                next_beams.push_back(std::move(beam));
+                if (next_beams.size() == n_beams) {
+                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
+                }
+            } else if (next_beams.front().p < beam.p) {
+                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
+                next_beams.back() = std::move(beam);
+                std::push_heap(next_beams.begin(), next_beams.end(), comp);
+            }
+        } else {
+            // beam is not at end-of-sentence, so branch with next top_k tokens.
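The "two stages of heaps" pattern described in the comments above is generic and worth seeing in isolation. The sketch below applies the same std::make_heap/std::pop_heap/std::push_heap sequence to plain floats; the helper name top_k_values is illustrative only and is not part of the llama.cpp API.

#include <algorithm>
#include <cstddef>
#include <vector>

// Keep the k largest values seen so far in a min-heap (greater-than comparator).
static std::vector<float> top_k_values(const std::vector<float> & xs, size_t k) {
    std::vector<float> heap;
    heap.reserve(k);
    const auto comp = [](float a, float b) { return a > b; }; // min-heap ordering
    for (float x : xs) {
        if (heap.size() < k) {
            heap.push_back(x);                     // stage 1: gather until full
            if (heap.size() == k) {
                std::make_heap(heap.begin(), heap.end(), comp);
            }
        } else if (heap.front() < x) {             // stage 2: replace the current minimum
            std::pop_heap(heap.begin(), heap.end(), comp);
            heap.back() = x;
            std::push_heap(heap.begin(), heap.end(), comp);
        }
    }
    return heap; // the (up to) k largest values, in heap order
}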
+ if (!beam.tokens.empty()) { + llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads); + } + llama_logit_info logit_info(ctx); + std::vector next_tokens = logit_info.top_k(n_beams); + size_t i=0; + if (next_beams.size() < n_beams) { + for (; next_beams.size() < n_beams ; ++i) { + llama_beam next_beam = beam; + next_beam.tokens.push_back(next_tokens[i].id); + next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit); + next_beams.push_back(std::move(next_beam)); + } + std::make_heap(next_beams.begin(), next_beams.end(), comp); + } else { + for (; next_beams.front().p == 0.0f ; ++i) { + std::pop_heap(next_beams.begin(), next_beams.end(), comp); + next_beams.back() = beam; + next_beams.back().tokens.push_back(next_tokens[i].id); + next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit); + std::push_heap(next_beams.begin(), next_beams.end(), comp); + } + } + for (; i < n_beams ; ++i) { + const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit); + if (next_beams.front().p < next_p) { + std::pop_heap(next_beams.begin(), next_beams.end(), comp); + next_beams.back() = beam; + next_beams.back().tokens.push_back(next_tokens[i].id); + next_beams.back().p = next_p; + std::push_heap(next_beams.begin(), next_beams.end(), comp); + } + } + } + } + + // Find common_prefix_length based on beams. + // Requires beams is not empty. + size_t find_common_prefix_length() { + size_t common_prefix_length = beams[0].tokens.size(); + for (size_t i = 1 ; i < beams.size() ; ++i) { + common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size()); + for (size_t j = 0 ; j < common_prefix_length ; ++j) { + if (beams[0].tokens[j] != beams[i].tokens[j]) { + common_prefix_length = j; + break; + } + } + } + return common_prefix_length; + } + + // Construct beams_state to send back to caller via the callback function. + // Side effect: set common_prefix_length = find_common_prefix_length(); + llama_beams_state get_beams_state(const bool last_call) { + for (size_t i = 0 ; i < beams.size() ; ++i) { + beam_views[i] = beams[i].view(); + } + common_prefix_length = find_common_prefix_length(); + return {beam_views.data(), beams.size(), common_prefix_length, last_call}; + } + + // Loop: + // * while i < n_predict, AND + // * any of the beams have not yet reached end-of-beam (eob), AND + // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence + // (since all other beam probabilities can only decrease) + void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) { + beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob. + const auto not_eob = [](const llama_beam & beam) { return !beam.eob; }; + for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) && + !beams[top_beam_index()].eob ; ++i) { + callback(callback_data, get_beams_state(false)); // Sets common_prefix_length + update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed. + if (common_prefix_length) { + llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads); + n_past += common_prefix_length; + } + // Zero-out next_beam probabilities to place them last in following min-heap. 
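To illustrate the prefix bookkeeping used by this loop: with beams holding token sequences {5, 9, 2} and {5, 9, 7, 1}, find_common_prefix_length() returns 2, so the loop can llama_eval() those two shared tokens once and then shift them off every beam. A minimal standalone version of the same scan, using plain int vectors and a hypothetical helper name:

#include <algorithm>
#include <cstddef>
#include <vector>

// Longest prefix shared by all sequences (0 if the set is empty).
static size_t common_prefix_length(const std::vector<std::vector<int>> & seqs) {
    if (seqs.empty()) {
        return 0;
    }
    size_t n = seqs[0].size();
    for (size_t i = 1; i < seqs.size(); ++i) {
        n = std::min(n, seqs[i].size());
        for (size_t j = 0; j < n; ++j) {
            if (seqs[0][j] != seqs[i][j]) {
                n = j;
                break;
            }
        }
    }
    return n;
}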
+ std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; }); + for (llama_beam & beam : beams) { + beam.shift_tokens(common_prefix_length); + fill_next_beams_by_top_probabilities(beam); + } + // next_beams become the beams of next/final iteration. Swap them to re-use memory. + beams.swap(next_beams); + renormalize_beam_probabilities(beams); + } + collapse_beams(top_beam_index()); + callback(callback_data, get_beams_state(true)); + } + + // As beams grow, the cumulative probabilities decrease. + // Renormalize them to avoid floating point underflow. + static void renormalize_beam_probabilities(std::vector & beams) { + const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; }; + const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p); + std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; }); + } + + // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering. + size_t top_beam_index() { + return std::max_element(beams.begin(), beams.end()) - beams.begin(); + } + + // Copy (p,eob) for each beam which may have been changed by the callback. + void update_beams_from_beam_views() { + for (size_t i = 0 ; i < beams.size() ; ++i) { + beams[i].p = beam_views[i].p; + beams[i].eob = beam_views[i].eob; + } + } +}; + +void llama_beam_search(llama_context * ctx, + llama_beam_search_callback_fn_t callback, void * callback_data, + size_t n_beams, int n_past, int n_predict, int n_threads) { + assert(ctx); + const int64_t t_start_sample_us = ggml_time_us(); + + llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads); + + beam_search_data.loop(callback, callback_data); + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; +} + // // quantization // @@ -4423,6 +4675,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + llama_model model; + llm_load_arch(*ml, model); + llm_load_hparams(*ml, model, 0, 0, 0); + const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -4448,6 +4704,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ++n_feed_forward_w2; } } + if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", + __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + } int i_attention_wv = 0; int i_feed_forward_w2 = 0; @@ -4524,8 +4784,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K == 0 && ny % QK_K == 0) { + if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } } else if (name.find("attn_v.weight") != std::string::npos) { @@ -4539,21 +4801,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if 
(model.type == MODEL_70B) { + // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is + // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with + // nearly negligible increase in model size by quantizing this tensor with more bits: + if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; + } ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (model.arch == LLM_ARCH_FALCON) { + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) { + new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (model.arch != LLM_ARCH_FALCON) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + } + } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; @@ -4568,8 +4858,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; - if (nx % QK_K != 0 || ny % QK_K != 0) { - LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + if (nx % QK_K != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, 
required for k-quants\n", __func__, nx, ny, QK_K); convert_incompatible_tensor = true; } } @@ -5297,13 +5587,29 @@ int llama_model_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } -int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) { +int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", model->name.c_str(), llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } +uint64_t llama_model_size(const struct llama_model * model) { + uint64_t size = 0; + for (const auto & it : model->tensors_by_name) { + size += ggml_nbytes(it.second); + } + return size; +} + +uint64_t llama_model_n_params(const struct llama_model * model) { + uint64_t nparams = 0; + for (const auto & it : model->tensors_by_name) { + nparams += ggml_nelements(it.second); + } + return nparams; +} + int llama_model_quantize( const char * fname_inp, const char * fname_out, @@ -5828,8 +6134,7 @@ int llama_tokenize_with_model( llama_token * tokens, int n_max_tokens, bool add_bos) { - auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM; - auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape); + auto res = llama_tokenize_internal(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -5843,12 +6148,12 @@ int llama_tokenize_with_model( return res.size(); } -int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) { - return llama_token_to_str_with_model(&ctx->model, token, buf, length); +int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) { + return llama_token_to_piece_with_model(&ctx->model, token, buf, length); } -// does not write null-terminator to str -int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { +// does not write null-terminator to buf +int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { if (0 <= token && token < llama_model_n_vocab(model)) { if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; @@ -5936,6 +6241,7 @@ const char * llama_print_system_info(void) { s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; return s.c_str(); diff --git a/llama.h b/llama.h index 2bcf94e0f..b084fe23c 100644 --- a/llama.h +++ b/llama.h @@ -254,7 +254,11 @@ extern "C" { LLAMA_API int llama_model_n_embd (const struct llama_model * model); // Get a string describing the model type - LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size); + LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); + // Returns the total size of all the tensors in the model in bytes + LLAMA_API uint64_t llama_model_size(const struct llama_model * model); + // Returns the total number of parameters in the model + LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); // Returns 0 on success LLAMA_API int llama_model_quantize( @@ -377,15 +381,17 @@ extern "C" { int 
                                     int   n_max_tokens,
                                    bool   add_bos);
 
-    // Token Id -> String. Uses the vocabulary in the provided context
-    // Does not write null terminator to the buffer
-    LLAMA_API int llama_token_to_str(
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
               const struct llama_context * ctx,
                            llama_token   token,
                                   char * buf,
                                   int    length);
 
-    LLAMA_API int llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_piece_with_model(
                 const struct llama_model * model,
                            llama_token   token,
                                   char * buf,
@@ -465,6 +471,43 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float  p;        // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;               // Number of elements in beam_views[].
+        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+        bool   last_call;             // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
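A minimal sketch of a callback compatible with the llama_beam_search() API of this hunk (doc comment above, declaration immediately below). It assumes only the llama.h declarations added here; the struct name beam_search_state and the parameter values in the usage comment are illustrative, not part of the API. Whenever common_prefix_length is non-zero, those tokens are final for every beam, so the callback records them before they are shifted off:

#include "llama.h"

#include <vector>

struct beam_search_state {
    std::vector<llama_token> response; // tokens that have become final for all beams
};

static void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
    auto & state = *static_cast<beam_search_state *>(callback_data);
    if (beams_state.common_prefix_length > 0) {
        // The shared prefix is about to be removed from all beams; copy it now.
        const llama_beam_view & view = beams_state.beam_views[0];
        state.response.insert(state.response.end(),
                              view.tokens, view.tokens + beams_state.common_prefix_length);
    }
    // A caller may also set beam_views[i].eob here, e.g. when it sees an EOS token.
}

// Hypothetical usage, after the prompt has been evaluated for n_past tokens:
//   beam_search_state state;
//   llama_beam_search(ctx, beam_search_callback, &state,
//                     /*n_beams=*/4, n_past, /*n_predict=*/64, /*n_threads=*/4);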
+ LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads); + // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx); diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh new file mode 100755 index 000000000..01fda16fd --- /dev/null +++ b/scripts/convert-gg.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +# LLaMA v1 +python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 + +# LLaMA v2 +python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 + +# Code Llama +python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 + +# Falcon +python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1 +mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf + +python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1 +mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh index 1b3d07da5..1376e4194 100755 --- a/scripts/qnt-all.sh +++ b/scripts/qnt-all.sh @@ -20,6 +20,8 @@ fi model="$1" out="../tmp/results-${model}" +set -e + mkdir -p ${out} for q in ${qnt[@]}; do diff --git a/scripts/run-all-perf.sh b/scripts/run-all-perf.sh index 91a6d853f..7391e3dd5 100755 --- a/scripts/run-all-perf.sh +++ b/scripts/run-all-perf.sh @@ -20,6 +20,8 @@ fi model="$1" out="../tmp/results-${model}" +set -e + mkdir -p ${out} mstr="" diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh index 366d0866c..f643ca3ae 100755 --- a/scripts/run-all-ppl.sh +++ b/scripts/run-all-ppl.sh @@ -17,6 +17,8 @@ if [ ! 
-z "$3" ]; then args="$3" fi +set -e + model="$1" out="../tmp/results-${model}" diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2afaf86b1..ca1f39d31 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,8 +25,10 @@ endfunction() llama_build_and_test_executable(test-quantize-fns.cpp) llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-sampling.cpp) -llama_build_executable(test-tokenizer-0.cpp) -llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-llama.cpp) +llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-falcon.cpp) +#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1.cpp) # test-tokenizer-1 requires a BPE vocab. re-enable when we have one. #llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp new file mode 100644 index 000000000..836fb8ad2 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.cpp @@ -0,0 +1,178 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-falcon.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 204, }, }, + { " " , { 258, }, }, + { " " , { 466, }, }, + { "\t" , { 192, }, }, + { "\n" , { 193, }, }, + { "\t\n" , { 19125, }, }, + { "Hello world" , { 9856, 1079, }, }, + { " Hello world" , { 23090, 1079, }, }, + { "Hello World" , { 9856, 2889, }, }, + { " Hello World" , { 23090, 2889, }, }, + { " Hello World!" , { 23090, 2889, 12, }, }, + { "Hello, world!" , { 9856, 23, 1079, 12, }, }, + { " Hello, world!" 
, { 23090, 23, 1079, 12, }, }, + { " this is πŸ¦™.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, }, + { "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, }, + { "Hello" , { 9856, }, }, + { " Hello" , { 23090, }, }, + { " Hello" , { 204, 23090, }, }, + { " Hello" , { 258, 23090, }, }, + { " Hello" , { 466, 23090, }, }, + { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) { + fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_bpe(ctx, res).c_str(), + llama_detokenize_bpe(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, 
fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 0 : 3; +} diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py new file mode 100644 index 000000000..9c8c1c7d1 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import os +import sys +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp new file mode 100644 index 000000000..8630742c6 --- /dev/null +++ b/tests/test-tokenizer-0-llama.cpp @@ -0,0 +1,182 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-llama.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" 
, { }, }, + { " " , { 259, }, }, + { " " , { 1678, }, }, + { " " , { 268, }, }, + { "\t" , { 29871, 12, }, }, + { "\n" , { 29871, 13, }, }, + { "\t\n" , { 29871, 12, 13, }, }, + { "Hello world" , { 15043, 3186, }, }, + { " Hello world" , { 29871, 15043, 3186, }, }, + { "Hello World" , { 15043, 2787, }, }, + { " Hello World" , { 29871, 15043, 2787, }, }, + { " Hello World!" , { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!" , { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, }, + { " this is πŸ¦™.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello" , { 15043, }, }, + { " Hello" , { 29871, 15043, }, }, + { " Hello" , { 259, 15043, }, }, + { " Hello" , { 1678, 15043, }, }, + { " Hello" , { 268, 15043, }, }, + { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) { + fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); + const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str()); + printf("tok: "); + for (const auto & tok : res_bos) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; + + for (int i = 0; i < (int) res_nobos.size() && correct; ++i) { + if (test_kv.second[i] != res_bos[i + 1]) { + correct = false; + } 
+ if (test_kv.second[i] != res_nobos[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_spm(ctx, res_nobos).c_str(), + llama_detokenize_spm(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res_nobos) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 0 : 3; +} diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py new file mode 100644 index 000000000..bc164ee29 --- /dev/null +++ b/tests/test-tokenizer-0-llama.py @@ -0,0 +1,95 @@ +# tests with SPM tokenizer + +import os +import sys +import argparse + +from sentencepiece import SentencePieceProcessor + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + + +for text in tests: + print('text: ', text) + print('\nwith bos:') + print(tokenizer.encode(text, add_bos=True)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) + print('\nwithout bos:') + print(tokenizer.encode(text, add_bos=False)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) + +print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' +print("'" + tokenizer.id_to_piece(29871) + "'") # '_' +print("'" + tokenizer.decode([15043]) + "'") # 'Hello' +print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' +print("'" + tokenizer.decode([29871, 15043]) + 
"'") # ' Hello' +print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text, add_bos=False) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s, add_bos=True) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp deleted file mode 100644 index f3ee851a3..000000000 --- a/tests/test-tokenizer-0.cpp +++ /dev/null @@ -1,140 +0,0 @@ -#include "llama.h" -#include "common.h" - -#include -#include -#include -#include - -static std::string unescape_whitespace(llama_context* ctx, const std::vector& tokens) { - std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - -static const std::map> & k_tests() { - static std::map> _k_tests = { - { " ", {1, 259, }, }, - { " ", { 1, 1678, }, }, - { " ", { 1, 268, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, - { "Hello world", { 1, 15043, 3186, }, }, - { " Hello world", { 1, 29871, 15043, 3186, }, }, - { "Hello World", { 1, 15043, 2787, }, }, - { " Hello World", { 1, 29871, 15043, 2787, }, }, - { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, - { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, - { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello", { 1, 15043 }, }, - { " Hello", { 1, 29871, 15043 }, }, - { " Hello", { 1, 259, 15043 }, }, - { " Hello", { 1, 1678, 15043 }, }, - { " Hello", { 1, 268, 15043 }, }, - { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, }, - }; - - return _k_tests; -} - -int main(int argc, char **argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(false); - - // load the vocab 
- { - auto lparams = llama_context_default_params(); - - lparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), lparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - ctx = llama_new_context_with_model(model, lparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - - const int n_vocab = llama_n_vocab(ctx); - - if (n_vocab != 32000) { - fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab); - llama_free_model(model); - llama_free(ctx); - return 2; - } - - bool success = true; - - for (const auto & test_kv : k_tests()) { - std::vector res = llama_tokenize(ctx, test_kv.first, true); - fprintf(stderr, "%s : '%s' tokenized to '%s'\n", - __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str()); - - bool correct = res.size() == test_kv.second.size(); - - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (res[i] != test_kv.second[i]) { - correct = false; - } - } - - if (!correct) { - fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str()); - fprintf(stderr, "%s : expected tokens: ", __func__); - for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - - success = false; - } - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return success ? 0 : 3; -} diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp index bd607d12b..ce4f2898c 100644 --- a/tests/test-tokenizer-1.cpp +++ b/tests/test-tokenizer-1.cpp @@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) { return result; } -static std::string unescape_whitespace(llama_context * ctx, const std::vector & tokens) { - std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - int main(int argc, char **argv) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); @@ -72,13 +64,13 @@ int main(int argc, char **argv) { const int n_vocab = llama_n_vocab(ctx); for (int i = 0; i < n_vocab; ++i) { - std::string forward = llama_token_to_str(ctx, i); + std::string forward = llama_token_to_piece(ctx, i); std::vector tokens = llama_tokenize(ctx, forward, false); if (tokens.size() == 1) { if (i != tokens[0]) { - std::string backward = llama_token_to_str(ctx, tokens[0]); + std::string backward = llama_token_to_piece(ctx, tokens[0]); fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n", - __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str()); + __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str()); return 2; } }
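The test-tokenizer-1 hunk above uses the std::string llama_token_to_piece() helper from common.h. For callers working with the raw llama.h API, a small wrapper over the buffer-based call might look like the sketch below; the helper name token_to_piece and the treatment of a negative return value as the negated required buffer size are assumptions modeled on the common.h helper, not guarantees of the header. Per the comment in llama.h, the caller also remains responsible for stripping the leading whitespace of the first non-BOS piece when decoding a sequence.

#include "llama.h"

#include <algorithm>
#include <string>
#include <vector>

static std::string token_to_piece(llama_context * ctx, llama_token token) {
    std::vector<char> buf(8, 0);
    int n = llama_token_to_piece(ctx, token, buf.data(), (int) buf.size());
    if (n < 0) {
        // Assumed convention: -n is the buffer size the piece requires.
        buf.resize((size_t) -n);
        n = llama_token_to_piece(ctx, token, buf.data(), (int) buf.size());
    }
    // llama_token_to_piece() does not write a null terminator.
    return std::string(buf.data(), (size_t) std::max(n, 0));
}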