Merge branch 'ggerganov:master' into systemd-units
This commit is contained in:
commit
46395e6311
40 changed files with 2278 additions and 624 deletions
|
@ -11,6 +11,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||||
|
|
||||||
### Hot topics
|
### Hot topics
|
||||||
|
|
||||||
|
- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
|
||||||
|
|
||||||
|
- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
|
||||||
|
|
||||||
- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
|
- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
|
||||||
|
|
||||||
- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
|
- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
|
||||||
|
|
140
ci/run.sh
140
ci/run.sh
|
@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
|
||||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 {
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
# lora
|
||||||
|
function compare_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
path_lora="../models-mnt/open-llama/3B-v2/lora"
|
||||||
|
path_shakespeare="../models-mnt/shakespeare"
|
||||||
|
|
||||||
|
shakespeare="${path_shakespeare}/shakespeare.txt"
|
||||||
|
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
||||||
|
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
||||||
|
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
||||||
|
|
||||||
|
python3 ../convert-lora-to-ggml.py ${path_lora}
|
||||||
|
|
||||||
|
# f16
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
||||||
|
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# q8_0
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
||||||
|
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# q8_0 + f16 lora-base
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||||
|
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
|
||||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 {
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
||||||
|
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
||||||
|
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
||||||
|
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
||||||
|
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_7b_v2
|
||||||
|
@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 {
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
# lora
|
||||||
|
function compare_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
path_lora="../models-mnt/open-llama/7B-v2/lora"
|
||||||
|
path_shakespeare="../models-mnt/shakespeare"
|
||||||
|
|
||||||
|
shakespeare="${path_shakespeare}/shakespeare.txt"
|
||||||
|
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
||||||
|
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
||||||
|
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
||||||
|
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
||||||
|
|
||||||
|
python3 ../convert-lora-to-ggml.py ${path_lora}
|
||||||
|
|
||||||
|
# f16
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
||||||
|
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# currently not supported by the CUDA backend
|
||||||
|
# q8_0
|
||||||
|
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
||||||
|
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
||||||
|
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
|
# q8_0 + f16 lora-base
|
||||||
|
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||||
|
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
||||||
|
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
||||||
|
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
||||||
|
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
||||||
|
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
## main
|
## main
|
||||||
|
|
|
@ -733,12 +733,12 @@ std::vector<llama_token> llama_tokenize(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||||
std::vector<char> result(8, 0);
|
std::vector<char> result(8, 0);
|
||||||
const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
|
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
result.resize(-n_tokens);
|
result.resize(-n_tokens);
|
||||||
int check = llama_token_to_str(ctx, token, result.data(), result.size());
|
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||||
GGML_ASSERT(check == -n_tokens);
|
GGML_ASSERT(check == -n_tokens);
|
||||||
} else {
|
} else {
|
||||||
result.resize(n_tokens);
|
result.resize(n_tokens);
|
||||||
|
@ -746,3 +746,36 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
|
||||||
|
|
||||||
return std::string(result.data(), result.size());
|
return std::string(result.data(), result.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Concatenates the pieces of all `tokens` into one string.
// The leading space of the first non-BOS token's piece is dropped: that is the
// piece at index 1 when the sequence starts with BOS, otherwise the piece at index 0.
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const llama_token bos = llama_token_bos(ctx);

    std::string text;
    size_t idx = 0;

    for (const llama_token tok : tokens) {
        std::string cur = llama_token_to_piece(ctx, tok);

        // position of the first non-BOS token depends on whether the sequence starts with BOS
        const bool is_first_non_bos = (tokens[0] == bos) ? (idx == 1) : (idx == 0);

        if (is_first_non_bos && cur[0] == ' ') {
            cur.erase(0, 1);
        }

        text += cur;
        ++idx;
    }

    return text;
}
|
||||||
|
|
||||||
|
// Concatenates the pieces of all `tokens` into one string, without any
// adjustment of leading spaces.
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::string text;

    for (const llama_token tok : tokens) {
        text += llama_token_to_piece(ctx, tok);
    }

    return text;
}
|
||||||
|
|
|
@ -28,6 +28,7 @@ struct gpt_params {
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
|
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||||
|
|
||||||
|
@ -115,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
// Vocab utils
|
// Vocab utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// tokenizes a string into a vector of tokens
|
||||||
|
// should work similar to Python's `tokenizer.encode`
|
||||||
std::vector<llama_token> llama_tokenize(
|
std::vector<llama_token> llama_tokenize(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
const std::string & text,
|
const std::string & text,
|
||||||
bool add_bos);
|
bool add_bos);
|
||||||
|
|
||||||
std::string llama_token_to_str(
|
// converts a token into a piece
|
||||||
|
// should work similar to Python's `tokenizer.id_to_piece`
|
||||||
|
std::string llama_token_to_piece(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
llama_token token);
|
llama_token token);
|
||||||
|
|
||||||
|
// TODO: these should be moved into the llama.h C-style API under a single `llama_detokenize` function
|
||||||
|
// that takes into account the tokenizer type and decides how to handle the leading space
|
||||||
|
//
|
||||||
|
// detokenizes a vector of tokens into a string
|
||||||
|
// should work similar to Python's `tokenizer.decode`
|
||||||
|
// removes the leading space from the first non-BOS token
|
||||||
|
std::string llama_detokenize_spm(
|
||||||
|
llama_context * ctx,
|
||||||
|
const std::vector<llama_token> & tokens);
|
||||||
|
|
||||||
|
// detokenizes a vector of tokens into a string
|
||||||
|
// should work similar to Python's `tokenizer.decode`
|
||||||
|
std::string llama_detokenize_bpe(
|
||||||
|
llama_context * ctx,
|
||||||
|
const std::vector<llama_token> & tokens);
|
||||||
|
|
251
convert.py
251
convert.py
|
@ -3,6 +3,7 @@
|
||||||
import gguf
|
import gguf
|
||||||
import argparse
|
import argparse
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
||||||
import copy
|
import copy
|
||||||
import enum
|
import enum
|
||||||
import faulthandler
|
import faulthandler
|
||||||
|
@ -17,13 +18,14 @@ import re
|
||||||
import signal
|
import signal
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABCMeta, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
|
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union)
|
||||||
from sentencepiece import SentencePieceProcessor # type: ignore
|
from sentencepiece import SentencePieceProcessor # type: ignore
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -37,30 +39,70 @@ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||||
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
||||||
|
|
||||||
|
DEFAULT_CONCURRENCY = 8
|
||||||
#
|
#
|
||||||
# data types
|
# data types
|
||||||
#
|
#
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class UnquantizedDataType:
|
class DataType:
|
||||||
name: str
|
name: str
|
||||||
|
dtype: 'np.dtype[Any]'
|
||||||
|
valid_conversions: List[str]
|
||||||
|
|
||||||
DT_F16 = UnquantizedDataType('F16')
|
def elements_to_bytes(self, n_elements: int) -> int:
|
||||||
DT_F32 = UnquantizedDataType('F32')
|
return n_elements * self.dtype.itemsize
|
||||||
DT_I32 = UnquantizedDataType('I32')
|
|
||||||
DT_BF16 = UnquantizedDataType('BF16')
|
|
||||||
|
|
||||||
DataType = Union[UnquantizedDataType]
|
@dataclass(frozen=True)
|
||||||
|
class UnquantizedDataType(DataType):
|
||||||
|
pass
|
||||||
|
|
||||||
DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
|
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
|
||||||
DT_BF16: np.dtype(np.uint16),
|
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
|
||||||
DT_F16: np.dtype(np.float16),
|
# I32 is a 32-bit integer type, so it must be backed by np.int32 (4-byte items)
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int32), valid_conversions = [])
|
||||||
DT_F32: np.dtype(np.float32),
|
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
|
||||||
DT_I32: np.dtype(np.int32),
|
|
||||||
}
|
|
||||||
|
|
||||||
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
|
@dataclass(frozen=True)
|
||||||
{dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
|
class QuantizedDataType(DataType):
|
||||||
|
block_size: int
|
||||||
|
quantized_dtype: 'np.dtype[Any]'
|
||||||
|
ggml_type: gguf.GGMLQuantizationType
|
||||||
|
|
||||||
|
def quantize(self, arr: NDArray) -> NDArray:
|
||||||
|
raise NotImplementedError(f'Quantization for {self.name} not implemented')
|
||||||
|
|
||||||
|
def elements_to_bytes(self, n_elements: int) -> int:
|
||||||
|
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
|
||||||
|
return self.quantized_dtype.itemsize * (n_elements // self.block_size)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Q8_0QuantizedDataType(QuantizedDataType):
    # Mini Q8_0 quantization in Python!
    def quantize(self, arr: NDArray) -> NDArray:
        """Quantize a float32 array block by block.

        The array is viewed as rows of `self.block_size` elements. For every row a
        scale `d = max(|x|) / 127` is computed and the row is stored as `round(x / d)`.
        The result has one record of `self.quantized_dtype` per block.

        `arr` must be non-empty, of dtype float32, and its size must be a multiple
        of `self.block_size` (checked with assertions).
        """
        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
        n_blocks = arr.size // self.block_size
        blocks = arr.reshape((n_blocks, self.block_size))
        # Much faster implementation of block quantization contributed by @Cebtenzzre
        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]:
            d = abs(blocks).max(axis = 1) / np.float32(127)
            # An all-zero block has d == 0, so the division below is 0/0, which numpy
            # reports as 'invalid' (not 'divide'); silence both, the NaNs are reset next.
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                qs = (blocks / d[:, None]).round()
            qs[d == 0] = 0
            yield from zip(d, qs)
        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
|
||||||
|
|
||||||
|
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
||||||
|
dtype = np.dtype(np.float32), valid_conversions = [],
|
||||||
|
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
|
||||||
|
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
||||||
|
|
||||||
|
# Quantized types skipped here because they may also map to np.float32
|
||||||
|
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = {}
|
||||||
|
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
|
||||||
|
if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
|
||||||
|
raise ValueError(f'Invalid duplicate data type {dt}')
|
||||||
|
NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
|
||||||
|
|
||||||
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||||
'BF16': DT_BF16,
|
'BF16': DT_BF16,
|
||||||
|
@ -73,20 +115,22 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||||
# TODO: rename to LLAMAFileType
|
# TODO: rename to LLAMAFileType
|
||||||
# TODO: move to `gguf.py`
|
# TODO: move to `gguf.py`
|
||||||
class GGMLFileType(enum.IntEnum):
|
class GGMLFileType(enum.IntEnum):
|
||||||
AllF32 = 0
|
AllF32 = 0
|
||||||
MostlyF16 = 1 # except 1d tensors
|
MostlyF16 = 1 # except 1d tensors
|
||||||
|
MostlyQ8_0 = 7 # except 1d tensors
|
||||||
|
|
||||||
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
|
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
|
||||||
if len(tensor.shape) == 1:
|
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
||||||
# 1D tensors are always F32.
|
if dt is None:
|
||||||
return DT_F32
|
|
||||||
elif self == GGMLFileType.AllF32:
|
|
||||||
return DT_F32
|
|
||||||
elif self == GGMLFileType.MostlyF16:
|
|
||||||
return DT_F16
|
|
||||||
else:
|
|
||||||
raise ValueError(self)
|
raise ValueError(self)
|
||||||
|
# 1D tensors are always F32.
|
||||||
|
return dt if len(tensor.shape) > 1 else DT_F32
|
||||||
|
|
||||||
|
GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = {
|
||||||
|
GGMLFileType.AllF32 : DT_F32,
|
||||||
|
GGMLFileType.MostlyF16 : DT_F16,
|
||||||
|
GGMLFileType.MostlyQ8_0: DT_Q8_0,
|
||||||
|
}
|
||||||
|
|
||||||
#
|
#
|
||||||
# hparams loading
|
# hparams loading
|
||||||
|
@ -105,6 +149,7 @@ class Params:
|
||||||
f_norm_eps: float
|
f_norm_eps: float
|
||||||
|
|
||||||
f_rope_freq_base: Optional[float] = None
|
f_rope_freq_base: Optional[float] = None
|
||||||
|
f_rope_scale: Optional[float] = None
|
||||||
|
|
||||||
ftype: Optional[GGMLFileType] = None
|
ftype: Optional[GGMLFileType] = None
|
||||||
|
|
||||||
|
@ -160,13 +205,20 @@ class Params:
|
||||||
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||||
config = json.load(open(config_path))
|
config = json.load(open(config_path))
|
||||||
|
|
||||||
n_vocab = config["vocab_size"]
|
n_vocab = config["vocab_size"]
|
||||||
n_embd = config["hidden_size"]
|
n_embd = config["hidden_size"]
|
||||||
n_layer = config["num_hidden_layers"]
|
n_layer = config["num_hidden_layers"]
|
||||||
n_ff = config["intermediate_size"]
|
n_ff = config["intermediate_size"]
|
||||||
n_head = config["num_attention_heads"]
|
n_head = config["num_attention_heads"]
|
||||||
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
||||||
f_norm_eps = config["rms_norm_eps"]
|
f_norm_eps = config["rms_norm_eps"]
|
||||||
|
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||||
|
|
||||||
|
rope_scaling = config.get("rope_scaling")
|
||||||
|
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
|
||||||
|
f_rope_scale = config["rope_scaling"].get("factor")
|
||||||
|
else:
|
||||||
|
f_rope_scale = None
|
||||||
|
|
||||||
n_mult = Params.find_n_mult(n_ff, n_embd)
|
n_mult = Params.find_n_mult(n_ff, n_embd)
|
||||||
|
|
||||||
|
@ -179,15 +231,17 @@ class Params:
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab = n_vocab,
|
n_vocab = n_vocab,
|
||||||
n_embd = n_embd,
|
n_embd = n_embd,
|
||||||
n_mult = n_mult,
|
n_mult = n_mult,
|
||||||
n_layer = n_layer,
|
n_layer = n_layer,
|
||||||
n_ctx = n_ctx,
|
n_ctx = n_ctx,
|
||||||
n_ff = n_ff,
|
n_ff = n_ff,
|
||||||
n_head = n_head,
|
n_head = n_head,
|
||||||
n_head_kv = n_head_kv,
|
n_head_kv = n_head_kv,
|
||||||
f_norm_eps = f_norm_eps,
|
f_norm_eps = f_norm_eps,
|
||||||
|
f_rope_freq_base = f_rope_freq_base,
|
||||||
|
f_rope_scale = f_rope_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
# LLaMA v2 70B params.json
|
# LLaMA v2 70B params.json
|
||||||
|
@ -405,7 +459,7 @@ class UnquantizedTensor(Tensor):
|
||||||
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
|
||||||
|
|
||||||
def astype(self, data_type: DataType) -> Tensor:
|
def astype(self, data_type: DataType) -> Tensor:
|
||||||
dtype = DATA_TYPE_TO_NUMPY[data_type]
|
dtype = data_type.dtype
|
||||||
if self.data_type == DT_BF16:
|
if self.data_type == DT_BF16:
|
||||||
self.ndarray = bf16_to_fp32(self.ndarray)
|
self.ndarray = bf16_to_fp32(self.ndarray)
|
||||||
return UnquantizedTensor(self.ndarray.astype(dtype))
|
return UnquantizedTensor(self.ndarray.astype(dtype))
|
||||||
|
@ -444,22 +498,6 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
|
||||||
GGMLCompatibleTensor = Union[UnquantizedTensor]
|
GGMLCompatibleTensor = Union[UnquantizedTensor]
|
||||||
|
|
||||||
|
|
||||||
class DeferredPermutedTensor(Tensor):
|
|
||||||
def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
|
|
||||||
self.base = base
|
|
||||||
self.n_head = n_head
|
|
||||||
self.data_type = self.base.data_type
|
|
||||||
|
|
||||||
def astype(self, data_type: DataType) -> Tensor:
|
|
||||||
return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
|
|
||||||
|
|
||||||
def to_ggml(self) -> GGMLCompatibleTensor:
|
|
||||||
return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
|
|
||||||
|
|
||||||
def permute(self, n_head: int, n_head_kv: int) -> Tensor:
|
|
||||||
raise Exception("shouldn't permute twice")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class LazyTensor:
|
class LazyTensor:
|
||||||
_load: Callable[[], Tensor]
|
_load: Callable[[], Tensor]
|
||||||
|
@ -469,7 +507,9 @@ class LazyTensor:
|
||||||
|
|
||||||
def load(self) -> Tensor:
|
def load(self) -> Tensor:
|
||||||
ret = self._load()
|
ret = self._load()
|
||||||
assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
|
# Should be okay if it maps to the same numpy type?
|
||||||
|
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
|
||||||
|
(self.data_type, ret.data_type, self.description)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def astype(self, data_type: DataType) -> 'LazyTensor':
|
def astype(self, data_type: DataType) -> 'LazyTensor':
|
||||||
|
@ -480,8 +520,8 @@ class LazyTensor:
|
||||||
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
|
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
|
||||||
|
|
||||||
def validate_conversion_to(self, data_type: DataType) -> None:
|
def validate_conversion_to(self, data_type: DataType) -> None:
|
||||||
if data_type == self.data_type:
|
if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
|
||||||
return
|
raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
|
||||||
|
|
||||||
|
|
||||||
LazyModel = Dict[str, LazyTensor]
|
LazyModel = Dict[str, LazyTensor]
|
||||||
|
@ -607,9 +647,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||||
info = self.zip_file.getinfo(filename)
|
info = self.zip_file.getinfo(filename)
|
||||||
|
|
||||||
def load(offset: int, elm_count: int) -> NDArray:
|
def load(offset: int, elm_count: int) -> NDArray:
|
||||||
dtype = DATA_TYPE_TO_NUMPY.get(data_type)
|
dtype = data_type.dtype
|
||||||
if dtype is None:
|
|
||||||
raise Exception("tensor stored in unsupported format")
|
|
||||||
fp = self.zip_file.open(info)
|
fp = self.zip_file.open(info)
|
||||||
fp.seek(offset * dtype.itemsize)
|
fp.seek(offset * dtype.itemsize)
|
||||||
size = elm_count * dtype.itemsize
|
size = elm_count * dtype.itemsize
|
||||||
|
@ -673,7 +711,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||||
|
|
||||||
def convert(info: Dict[str, Any]) -> LazyTensor:
|
def convert(info: Dict[str, Any]) -> LazyTensor:
|
||||||
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
|
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
|
||||||
numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
|
numpy_dtype = data_type.dtype
|
||||||
shape: List[int] = info['shape']
|
shape: List[int] = info['shape']
|
||||||
begin, end = info['data_offsets']
|
begin, end = info['data_offsets']
|
||||||
assert 0 <= begin <= end <= len(byte_buf)
|
assert 0 <= begin <= end <= len(byte_buf)
|
||||||
|
@ -713,23 +751,35 @@ def lazy_load_file(path: Path) -> ModelPlus:
|
||||||
In = TypeVar('In')
Out = TypeVar('Out')


def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory.  Specifically, there is a max of one
    output value buffered per thread.

    Results are yielded in the same order as the inputs.

    func:        callable applied to every item.
    iterable:    input items; consumed lazily, at most `concurrency` items ahead
                 of the consumer.
    concurrency: maximum number of in-flight submissions.  Values below 2
                 disable the executor entirely and fall back to a plain `map`.
    max_workers: forwarded to `factory` as the `max_workers` keyword argument.
    factory:     callable returning the executor; it is used as a context
                 manager and must provide `submit`.'''
    if concurrency < 2:
        yield from map(func, iterable)
        # Without this return, execution would fall through to the executor
        # path below and a re-iterable input (list, dict view, ...) would be
        # processed and yielded a second time.
        return
    items = iter(iterable)
    with factory(max_workers = max_workers) as executor:
        futures: List[concurrent.futures.Future[Out]] = []
        done = False
        # Prime the pipeline with up to `concurrency` submissions.
        for _ in range(concurrency):
            try:
                futures.append(executor.submit(func, next(items)))
            except StopIteration:
                done = True
                break

        while futures:
            # Wait for the oldest submission first so output order matches input order.
            result = futures.pop(0).result()
            # Top the queue back up before handing the result to the consumer.
            while not done and len(futures) < concurrency:
                try:
                    futures.append(executor.submit(func, next(items)))
                except StopIteration:
                    done = True
                    break
            yield result
|
||||||
|
|
||||||
|
|
||||||
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||||
if params.n_vocab != vocab.vocab_size:
|
if params.n_vocab != vocab.vocab_size:
|
||||||
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
|
||||||
|
@ -771,6 +821,9 @@ class OutputFile:
|
||||||
if params.f_rope_freq_base:
|
if params.f_rope_freq_base:
|
||||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||||
|
|
||||||
|
if params.f_rope_scale:
|
||||||
|
self.gguf.add_rope_scale_linear(params.f_rope_scale)
|
||||||
|
|
||||||
if params.ftype:
|
if params.ftype:
|
||||||
self.gguf.add_file_type(params.ftype)
|
self.gguf.add_file_type(params.ftype)
|
||||||
|
|
||||||
|
@ -791,12 +844,11 @@ class OutputFile:
|
||||||
self.gguf.add_token_types(toktypes)
|
self.gguf.add_token_types(toktypes)
|
||||||
|
|
||||||
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
||||||
n_elements = 1
|
n_elements = int(np.prod(tensor.shape))
|
||||||
for dim in tensor.shape:
|
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
|
||||||
n_elements *= dim
|
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
|
||||||
data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
|
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
|
||||||
data_nbytes = n_elements * data_type.itemsize
|
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
|
||||||
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
|
|
||||||
|
|
||||||
def write_meta(self) -> None:
|
def write_meta(self) -> None:
|
||||||
self.gguf.write_header_to_file()
|
self.gguf.write_header_to_file()
|
||||||
|
@ -822,7 +874,20 @@ class OutputFile:
|
||||||
of.close()
|
of.close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
|
def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]:
|
||||||
|
name, lazy_tensor = item
|
||||||
|
tensor = lazy_tensor.load().to_ggml()
|
||||||
|
return (lazy_tensor.data_type, tensor.ndarray)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray:
|
||||||
|
dt, arr = item
|
||||||
|
if not isinstance(dt, QuantizedDataType):
|
||||||
|
return arr
|
||||||
|
return dt.quantize(arr)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
|
||||||
check_vocab_size(params, vocab)
|
check_vocab_size(params, vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out)
|
of = OutputFile(fname_out)
|
||||||
|
@ -838,16 +903,19 @@ class OutputFile:
|
||||||
of.write_meta()
|
of.write_meta()
|
||||||
of.write_tensor_info()
|
of.write_tensor_info()
|
||||||
|
|
||||||
def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
|
|
||||||
name, lazy_tensor = item
|
|
||||||
return lazy_tensor.load().to_ggml().ndarray
|
|
||||||
|
|
||||||
# tensor data
|
# tensor data
|
||||||
ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
|
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
||||||
|
if ftype == GGMLFileType.MostlyQ8_0:
|
||||||
|
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor)
|
||||||
|
else:
|
||||||
|
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
||||||
|
elapsed = time.time() - start
|
||||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||||
padi = len(str(len(model)))
|
padi = len(str(len(model)))
|
||||||
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
|
print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
|
||||||
of.gguf.write_tensor_data(ndarray)
|
of.gguf.write_tensor_data(ndarray)
|
||||||
|
|
||||||
of.close()
|
of.close()
|
||||||
|
@ -859,6 +927,8 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
|
||||||
return GGMLFileType.AllF32
|
return GGMLFileType.AllF32
|
||||||
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
|
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
|
||||||
return GGMLFileType.MostlyF16
|
return GGMLFileType.MostlyF16
|
||||||
|
if output_type_str == "q8_0":
|
||||||
|
return GGMLFileType.MostlyQ8_0
|
||||||
|
|
||||||
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
|
||||||
|
|
||||||
|
@ -905,7 +975,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||||
print(f"skipping tensor {name_new}")
|
print(f"skipping tensor {name_new}")
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
|
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
||||||
out[name_new] = lazy_tensor
|
out[name_new] = lazy_tensor
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
@ -1010,6 +1080,7 @@ def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
|
||||||
namestr = {
|
namestr = {
|
||||||
GGMLFileType.AllF32: "f32",
|
GGMLFileType.AllF32: "f32",
|
||||||
GGMLFileType.MostlyF16: "f16",
|
GGMLFileType.MostlyF16: "f16",
|
||||||
|
GGMLFileType.MostlyQ8_0:"q8_0",
|
||||||
}[file_type]
|
}[file_type]
|
||||||
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
||||||
if ret in model_paths:
|
if ret in model_paths:
|
||||||
|
@ -1033,12 +1104,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||||
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||||
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
|
parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
||||||
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||||
|
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
||||||
args = parser.parse_args(args_in)
|
args = parser.parse_args(args_in)
|
||||||
|
|
||||||
if args.dump_single:
|
if args.dump_single:
|
||||||
|
@ -1060,6 +1132,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||||
params.ftype = {
|
params.ftype = {
|
||||||
"f32": GGMLFileType.AllF32,
|
"f32": GGMLFileType.AllF32,
|
||||||
"f16": GGMLFileType.MostlyF16,
|
"f16": GGMLFileType.MostlyF16,
|
||||||
|
"q8_0": GGMLFileType.MostlyQ8_0,
|
||||||
}[args.outtype]
|
}[args.outtype]
|
||||||
|
|
||||||
print(f"params = {params}")
|
print(f"params = {params}")
|
||||||
|
@ -1091,7 +1164,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||||
params.ftype = ftype
|
params.ftype = ftype
|
||||||
print(f"Writing {outfile}, format {ftype}")
|
print(f"Writing {outfile}, format {ftype}")
|
||||||
|
|
||||||
OutputFile.write_all(outfile, params, model, vocab)
|
OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency)
|
||||||
print(f"Wrote {outfile}")
|
print(f"Wrote {outfile}")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ else()
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(embd-input)
|
add_subdirectory(embd-input)
|
||||||
add_subdirectory(llama-bench)
|
add_subdirectory(llama-bench)
|
||||||
|
add_subdirectory(beam_search)
|
||||||
if (LLAMA_METAL)
|
if (LLAMA_METAL)
|
||||||
add_subdirectory(metal)
|
add_subdirectory(metal)
|
||||||
endif()
|
endif()
|
||||||
|
|
8
examples/beam_search/CMakeLists.txt
Normal file
8
examples/beam_search/CMakeLists.txt
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
set(TARGET beam_search)
|
||||||
|
add_executable(${TARGET} beam_search.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
if(TARGET BUILD_INFO)
|
||||||
|
add_dependencies(${TARGET} BUILD_INFO)
|
||||||
|
endif()
|
188
examples/beam_search/beam_search.cpp
Normal file
188
examples/beam_search/beam_search.cpp
Normal file
|
@ -0,0 +1,188 @@
|
||||||
|
#ifndef _GNU_SOURCE
|
||||||
|
#define _GNU_SOURCE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "build-info.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <ctime>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
|
#include <signal.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#elif defined (_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#define NOMINMAX
|
||||||
|
#include <windows.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Used for debugging to print out beam tokens.
|
||||||
|
struct ostream_beam_view {
|
||||||
|
llama_context * ctx;
|
||||||
|
llama_beam_view beam_view;
|
||||||
|
};
|
||||||
|
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
||||||
|
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||||
|
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||||
|
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||||
|
}
|
||||||
|
return os << ')';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Put here anything you want back in beam_search_callback().
|
||||||
|
struct beam_search_callback_data {
|
||||||
|
llama_context * ctx;
|
||||||
|
std::vector<llama_token> response;
|
||||||
|
};
|
||||||
|
|
||||||
|
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||||
|
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||||
|
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
|
||||||
|
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function matching type llama_beam_search_callback_fn_t.
|
||||||
|
// Custom callback example is called each time the beams lengths increase:
|
||||||
|
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
||||||
|
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||||
|
// This is also called when the stop condition is met.
|
||||||
|
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||||
|
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||||
|
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||||
|
// Mark beams as EOS as needed.
|
||||||
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
llama_beam_view& beam_view = beams_state.beam_views[i];
|
||||||
|
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
||||||
|
beam_view.eob = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(","); // Show progress
|
||||||
|
if (const size_t n = beams_state.common_prefix_length) {
|
||||||
|
callback_data.response.resize(callback_data.response.size() + n);
|
||||||
|
assert(0u < beams_state.n_beams);
|
||||||
|
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||||
|
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
||||||
|
printf("%lu", n);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
#if 1 // DEBUG: print current beams for this iteration
|
||||||
|
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
||||||
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv)
|
||||||
|
{
|
||||||
|
gpt_params params;
|
||||||
|
//params.n_gpu_layers = 200;
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Print help :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
if ( argc < 2 || argv[1][0] == '-' )
|
||||||
|
{
|
||||||
|
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
||||||
|
return 1 ;
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Load parameters :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
params.model = argv[1];
|
||||||
|
|
||||||
|
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
||||||
|
|
||||||
|
if ( argc > 3 )
|
||||||
|
{
|
||||||
|
params.prompt = argv[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( params.prompt.empty() )
|
||||||
|
{
|
||||||
|
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Init LLM :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
||||||
|
|
||||||
|
if ( model == NULL )
|
||||||
|
{
|
||||||
|
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Tokenize the prompt :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
|
const size_t max_context_size = llama_n_ctx( ctx );
|
||||||
|
const size_t max_tokens_list_size = max_context_size - 4 ;
|
||||||
|
|
||||||
|
if (tokens_list.size() > max_tokens_list_size)
|
||||||
|
{
|
||||||
|
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
|
||||||
|
__func__ , tokens_list.size() , max_tokens_list_size );
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf( stderr, "\n\n" );
|
||||||
|
|
||||||
|
// Print the tokens from the prompt :
|
||||||
|
|
||||||
|
for( auto id : tokens_list )
|
||||||
|
{
|
||||||
|
std::cout << llama_token_to_piece(ctx, id);
|
||||||
|
}
|
||||||
|
std::cout << std::flush;
|
||||||
|
|
||||||
|
int n_past = llama_get_kv_cache_token_count(ctx);
|
||||||
|
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
n_past += tokens_list.size();
|
||||||
|
|
||||||
|
beam_search_callback_data callback_data{ctx, {}};
|
||||||
|
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
||||||
|
int const n_predict = 256;
|
||||||
|
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
|
||||||
|
|
||||||
|
std::cout << "\n\n";
|
||||||
|
for (llama_token const token_id : callback_data.response) {
|
||||||
|
std::cout << llama_token_to_piece(ctx,token_id);
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
|
||||||
|
llama_free( ctx );
|
||||||
|
llama_free_model( model );
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -12,18 +12,14 @@ usage: ./convert-llama2c-to-ggml [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin')
|
--copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
|
||||||
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
|
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
|
||||||
--llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin')
|
--llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin')
|
||||||
```
|
```
|
||||||
|
|
||||||
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
||||||
|
|
||||||
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
|
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||||
|
|
||||||
For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
|
|
||||||
|
|
||||||
`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
|
|
||||||
|
|
||||||
Now you can use the model with a command like:
|
Now you can use the model with a command like:
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,48 @@
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
#include <sstream>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
// GGUF keys & tensor names.
|
||||||
|
|
||||||
|
#define KV_GENERAL_ARCHITECTURE "general.architecture"
|
||||||
|
#define KV_GENERAL_NAME "general.name"
|
||||||
|
|
||||||
|
#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
|
||||||
|
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
|
||||||
|
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
|
||||||
|
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
|
||||||
|
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
|
||||||
|
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
|
||||||
|
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
|
||||||
|
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
|
||||||
|
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
|
||||||
|
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
|
||||||
|
|
||||||
|
#define KV_CONTEXT_LENGTH "llama.context_length"
|
||||||
|
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
|
||||||
|
#define KV_BLOCK_COUNT "llama.block_count"
|
||||||
|
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
|
||||||
|
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
|
||||||
|
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
|
||||||
|
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
|
||||||
|
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
|
||||||
|
|
||||||
|
#define TN_TOKEN_EMBD "token_embd.weight"
|
||||||
|
#define TN_OUTPUT_NORM "output_norm.weight"
|
||||||
|
#define TN_OUTPUT "output.weight"
|
||||||
|
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
|
||||||
|
#define TN_ATTN_Q "blk.%d.attn_q.weight"
|
||||||
|
#define TN_ATTN_K "blk.%d.attn_k.weight"
|
||||||
|
#define TN_ATTN_V "blk.%d.attn_v.weight"
|
||||||
|
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
|
||||||
|
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
|
||||||
|
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
|
||||||
|
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
|
||||||
|
#define TN_FFN_UP "blk.%d.ffn_up.weight"
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
@ -20,6 +59,11 @@
|
||||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||||
#define LLAMA_FILE_VERSION_GGJT_V3 3
|
#define LLAMA_FILE_VERSION_GGJT_V3 3
|
||||||
|
|
||||||
|
#define TOKENIZER_NAME "llama"
|
||||||
|
#define UNKNOWN_TOKEN_ID 0
|
||||||
|
#define BOS_TOKEN_ID 1
|
||||||
|
#define EOS_TOKEN_ID 2
|
||||||
|
|
||||||
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
|
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int dim; // transformer dimension
|
int dim; // transformer dimension
|
||||||
|
@ -183,6 +227,7 @@ struct my_llama_hparams {
|
||||||
uint32_t n_vocab = 32000;
|
uint32_t n_vocab = 32000;
|
||||||
uint32_t n_ctx = 512; // this is provided as user input?
|
uint32_t n_ctx = 512; // this is provided as user input?
|
||||||
uint32_t n_embd = 4096;
|
uint32_t n_embd = 4096;
|
||||||
|
uint32_t n_ff = 11008;
|
||||||
uint32_t n_mult = 4;
|
uint32_t n_mult = 4;
|
||||||
uint32_t n_head = 32;
|
uint32_t n_head = 32;
|
||||||
uint32_t n_layer = 32;
|
uint32_t n_layer = 32;
|
||||||
|
@ -214,6 +259,8 @@ struct my_llama_layer {
|
||||||
struct my_llama_model {
|
struct my_llama_model {
|
||||||
struct ggml_context * ctx = NULL;
|
struct ggml_context * ctx = NULL;
|
||||||
|
|
||||||
|
std::string name;
|
||||||
|
|
||||||
my_llama_hparams hparams;
|
my_llama_hparams hparams;
|
||||||
|
|
||||||
struct ggml_tensor * tok_embeddings;
|
struct ggml_tensor * tok_embeddings;
|
||||||
|
@ -276,18 +323,13 @@ struct train_params {
|
||||||
int mem_compute1_gb;
|
int mem_compute1_gb;
|
||||||
};
|
};
|
||||||
|
|
||||||
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
|
|
||||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
|
||||||
return n_ff;
|
|
||||||
}
|
|
||||||
|
|
||||||
void print_params(struct my_llama_hparams * params) {
|
void print_params(struct my_llama_hparams * params) {
|
||||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||||
printf("%s: n_mult: %d\n", __func__, params->n_mult);
|
printf("%s: n_mult: %d\n", __func__, params->n_mult);
|
||||||
printf("%s: n_head: %d\n", __func__, params->n_head);
|
printf("%s: n_head: %d\n", __func__, params->n_head);
|
||||||
printf("%s: n_ff: %d\n", __func__, get_n_ff(params));
|
printf("%s: n_ff: %d\n", __func__, params->n_ff);
|
||||||
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
||||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||||
}
|
}
|
||||||
|
@ -299,7 +341,7 @@ void init_model(struct my_llama_model * model) {
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
const uint32_t n_vocab = hparams.n_vocab;
|
const uint32_t n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
const uint32_t n_ff = get_n_ff(&hparams);
|
const uint32_t n_ff = hparams.n_ff;
|
||||||
struct ggml_context * ctx = model->ctx;
|
struct ggml_context * ctx = model->ctx;
|
||||||
|
|
||||||
model->train_its = 0;
|
model->train_its = 0;
|
||||||
|
@ -481,21 +523,6 @@ struct llama_file {
|
||||||
return std::string(chars.data(), len);
|
return std::string(chars.data(), len);
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_raw(const void * ptr, size_t size) {
|
|
||||||
if (size == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
errno = 0;
|
|
||||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
|
||||||
if (ret != 1) {
|
|
||||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_u32(std::uint32_t val) {
|
|
||||||
write_raw(&val, sizeof(val));
|
|
||||||
}
|
|
||||||
|
|
||||||
~llama_file() {
|
~llama_file() {
|
||||||
if (fp) {
|
if (fp) {
|
||||||
std::fclose(fp);
|
std::fclose(fp);
|
||||||
|
@ -503,30 +530,6 @@ struct llama_file {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
|
|
||||||
if (tensor == NULL) {
|
|
||||||
file->write_u32(0);
|
|
||||||
file->write_u32(0);
|
|
||||||
file->write_u32(GGML_TYPE_F32);
|
|
||||||
file->seek((0-file->tell()) & 31, SEEK_CUR);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const char * name = ggml_get_name(tensor);
|
|
||||||
uint32_t name_len = strlen(name);
|
|
||||||
uint32_t nd = tensor->n_dims;
|
|
||||||
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
|
|
||||||
(uint32_t)tensor->ne[1],
|
|
||||||
(uint32_t)tensor->ne[2],
|
|
||||||
(uint32_t)tensor->ne[3] };
|
|
||||||
file->write_u32(nd);
|
|
||||||
file->write_u32(name_len);
|
|
||||||
file->write_u32(tensor->type);
|
|
||||||
file->write_raw(ne, sizeof(ne[0]) * nd);
|
|
||||||
file->write_raw(name, name_len);
|
|
||||||
file->seek((0-file->tell()) & 31, SEEK_CUR);
|
|
||||||
file->write_raw(tensor->data, ggml_nbytes(tensor));
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_ggml_file(const char *filename) {
|
bool is_ggml_file(const char *filename) {
|
||||||
llama_file file(filename, "rb");
|
llama_file file(filename, "rb");
|
||||||
if (file.size < 4) {
|
if (file.size < 4) {
|
||||||
|
@ -536,48 +539,96 @@ bool is_ggml_file(const char *filename) {
|
||||||
return magic == GGUF_MAGIC;
|
return magic == GGUF_MAGIC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns a copy of `text` in which every ASCII space (' ') is replaced by the three-byte
// UTF-8 sequence E2 96 81 (U+2581). All other bytes are copied through unchanged.
//
// The result is built directly in a std::string rather than through a std::ostringstream,
// so the function depends only on <string>.
static std::string llama_escape_whitespaces(const std::string& text) {
    static const char escaped_space[] = "\xe2\x96\x81";

    std::string out;
    out.reserve(text.size()); // lower bound: each space grows the output by two bytes

    for (const char c : text) {
        if (c == ' ') {
            out += escaped_space;
        } else {
            out += c;
        }
    }
    return out;
}
|
||||||
|
|
||||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||||
#pragma message("TODO: implement reading vocabulary using gguf")
|
if (is_ggml_file(filename)) {
|
||||||
// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
|
struct ggml_context * ctx_data = NULL;
|
||||||
// if (is_ggml_file(filename)) {
|
|
||||||
//
|
struct gguf_init_params params = {
|
||||||
// struct llama_context_params llama_params = llama_context_default_params();
|
/*.no_alloc = */ false,
|
||||||
// llama_params.vocab_only = true;
|
/*.ctx = */ &ctx_data,
|
||||||
//
|
};
|
||||||
// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
|
|
||||||
// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
|
struct gguf_context * ctx = gguf_init_from_file(filename, params);
|
||||||
//
|
GGML_ASSERT(ctx != NULL);
|
||||||
// const int n_vocab = llama_n_vocab(lctx);
|
|
||||||
// vocab->id_to_token.resize(n_vocab);
|
const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
|
||||||
// for (int i=0; i<n_vocab; ++i) {
|
GGML_ASSERT(model_idx >= 0);
|
||||||
// vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
|
std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
|
||||||
// vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
|
GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
|
||||||
// vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
|
|
||||||
// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
|
const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
|
||||||
// }
|
GGML_ASSERT(token_idx >= 0);
|
||||||
// llama_free(lctx);
|
|
||||||
// llama_free_model(lmodel);
|
const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
|
||||||
// } else
|
GGML_ASSERT(score_idx >= 0);
|
||||||
{ // assume llama2.c vocabulary
|
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||||
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
|
|
||||||
|
const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
|
||||||
|
GGML_ASSERT(toktype_idx >= 0);
|
||||||
|
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||||
|
|
||||||
|
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
|
||||||
|
|
||||||
|
vocab->id_to_token.resize(n_vocab);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < n_vocab; i++) {
|
||||||
|
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
||||||
|
|
||||||
|
vocab->token_to_id[word] = i;
|
||||||
|
|
||||||
|
auto & token_data = vocab->id_to_token[i];
|
||||||
|
token_data.text = std::move(word);
|
||||||
|
token_data.score = scores[i];
|
||||||
|
token_data.type = (llama_token_type) toktypes[i];
|
||||||
|
}
|
||||||
|
ggml_free(ctx_data);
|
||||||
|
gguf_free(ctx);
|
||||||
|
} else {
|
||||||
|
// assume llama2.c vocabulary
|
||||||
|
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
|
||||||
llama_file file(filename, "rb");
|
llama_file file(filename, "rb");
|
||||||
const int n_vocab = config->vocab_size;
|
const int n_vocab = config->vocab_size;
|
||||||
/* uint32_t max_token_length = */ file.read_u32(); // unused
|
/* uint32_t max_token_length = */ file.read_u32(); // unused
|
||||||
vocab->id_to_token.resize(n_vocab);
|
vocab->id_to_token.resize(n_vocab);
|
||||||
for (int i=0; i<n_vocab; ++i) {
|
for (llama_vocab::id id=0; id<n_vocab; ++id) {
|
||||||
float_t score = file.read_f32();
|
float_t score = file.read_f32();
|
||||||
uint32_t len = file.read_u32();
|
uint32_t len = file.read_u32();
|
||||||
std::string text = file.read_string(len);
|
std::string text = file.read_string(len);
|
||||||
// Special-case handling of <0xXX> single byte tokens.
|
|
||||||
char byte_val;
|
unsigned char byte_val;
|
||||||
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
|
llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
|
||||||
char cstr[2] = { byte_val, 0 };
|
if (id == UNKNOWN_TOKEN_ID) {
|
||||||
text = cstr;
|
text = "<unk>";
|
||||||
|
type = LLAMA_TOKEN_TYPE_UNKNOWN;
|
||||||
|
} else if (id == BOS_TOKEN_ID) {
|
||||||
|
text = "<s>";
|
||||||
|
type = LLAMA_TOKEN_TYPE_CONTROL;
|
||||||
|
} else if (id == EOS_TOKEN_ID) {
|
||||||
|
text = "</s>";
|
||||||
|
type = LLAMA_TOKEN_TYPE_CONTROL;
|
||||||
|
} else if (text.empty()) {
|
||||||
|
type = LLAMA_TOKEN_TYPE_CONTROL;
|
||||||
|
} else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
|
||||||
|
// Text of byte tokens is already in the expected format.
|
||||||
|
type = LLAMA_TOKEN_TYPE_BYTE;
|
||||||
|
} else {
|
||||||
|
type = LLAMA_TOKEN_TYPE_NORMAL;
|
||||||
}
|
}
|
||||||
vocab->id_to_token[i].text = text;
|
text = llama_escape_whitespaces(text);
|
||||||
vocab->id_to_token[i].score = score;
|
|
||||||
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
|
vocab->id_to_token[id].text = text;
|
||||||
vocab->token_to_id.emplace(text, i);
|
vocab->id_to_token[id].score = score;
|
||||||
|
vocab->id_to_token[id].type = type;
|
||||||
|
vocab->token_to_id.emplace(text, id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -619,33 +670,6 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
||||||
struct llama_file file(filename, "wb");
|
|
||||||
if (file.fp == NULL) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma message("TODO: implement file saving using gguf")
|
|
||||||
// write_magic
|
|
||||||
file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
|
|
||||||
file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
|
|
||||||
// write_hparams
|
|
||||||
file.write_u32(model->hparams.n_vocab);
|
|
||||||
file.write_u32(model->hparams.n_embd);
|
|
||||||
file.write_u32(model->hparams.n_mult);
|
|
||||||
file.write_u32(model->hparams.n_head);
|
|
||||||
file.write_u32(model->hparams.n_layer);
|
|
||||||
file.write_u32(model->hparams.n_rot);
|
|
||||||
file.write_u32(LLAMA_FTYPE_ALL_F32);
|
|
||||||
|
|
||||||
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
|
|
||||||
uint32_t n_vocab = model->hparams.n_vocab;
|
|
||||||
for (uint32_t i = 0; i < n_vocab; i++) {
|
|
||||||
const auto & token_data = vocab->id_to_token.at(i);
|
|
||||||
file.write_u32((uint32_t) token_data.text.size());
|
|
||||||
file.write_raw(token_data.text.data(), token_data.text.size());
|
|
||||||
file.write_raw(&token_data.score, sizeof(token_data.score));
|
|
||||||
}
|
|
||||||
|
|
||||||
// stuff AK weights into GG weights one by one.
|
// stuff AK weights into GG weights one by one.
|
||||||
// w->token_embedding_table -> model->tok_embeddings
|
// w->token_embedding_table -> model->tok_embeddings
|
||||||
// float* -> struct ggml_tensor
|
// float* -> struct ggml_tensor
|
||||||
|
@ -658,8 +682,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
||||||
// for rms-att-weight
|
// for rms-att-weight
|
||||||
int row_length = model->hparams.n_embd;
|
int row_length = model->hparams.n_embd;
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
//int n_ff = model->hparams.n_embd;
|
int n_ff = model->hparams.n_ff;
|
||||||
int n_ff = get_n_ff(&hparams);
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||||
auto & layer = model->layers[i];
|
auto & layer = model->layers[i];
|
||||||
|
@ -677,28 +700,91 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
||||||
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||||
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct gguf_context * ctx = gguf_init_empty();
|
||||||
|
|
||||||
|
std::vector<const char*> tokens;
|
||||||
|
std::vector<float> scores;
|
||||||
|
std::vector<llama_token_type> token_types;
|
||||||
|
for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
|
||||||
|
tokens.push_back(token_data.text.c_str());
|
||||||
|
scores.push_back(token_data.score);
|
||||||
|
token_types.push_back(token_data.type);
|
||||||
|
}
|
||||||
|
gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
|
||||||
|
gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
|
||||||
|
gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
|
||||||
|
|
||||||
|
gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
|
||||||
|
|
||||||
|
gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
|
||||||
|
gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
|
||||||
|
|
||||||
|
// special tokens
|
||||||
|
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
|
||||||
|
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
|
||||||
|
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
|
||||||
|
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
|
||||||
|
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
|
||||||
|
|
||||||
|
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
|
||||||
|
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
|
||||||
|
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
|
||||||
|
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
|
||||||
|
// n_head_kv is optional, default to n_head
|
||||||
|
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
|
||||||
|
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
|
||||||
|
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
|
||||||
|
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
|
||||||
|
|
||||||
// write tensors
|
// write tensors
|
||||||
write_tensor(&file, model->tok_embeddings);
|
ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
|
||||||
write_tensor(&file, model->norm);
|
gguf_add_tensor(ctx, model->tok_embeddings);
|
||||||
write_tensor(&file, model->output); // ?
|
|
||||||
|
ggml_set_name(model->norm, TN_OUTPUT_NORM);
|
||||||
|
gguf_add_tensor(ctx, model->norm);
|
||||||
|
|
||||||
|
ggml_set_name(model->output, TN_OUTPUT);
|
||||||
|
gguf_add_tensor(ctx, model->output);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
|
||||||
auto & layer = model->layers[i];
|
auto & layer = model->layers[i];
|
||||||
|
|
||||||
write_tensor(&file, layer.attention_norm);
|
ggml_format_name(layer.wq, TN_ATTN_Q, i);
|
||||||
write_tensor(&file, layer.wq);
|
gguf_add_tensor(ctx, layer.wq);
|
||||||
write_tensor(&file, layer.wk);
|
|
||||||
write_tensor(&file, layer.wv);
|
ggml_format_name(layer.wk, TN_ATTN_K, i);
|
||||||
write_tensor(&file, layer.wo);
|
gguf_add_tensor(ctx, layer.wk);
|
||||||
write_tensor(&file, layer.ffn_norm);
|
|
||||||
write_tensor(&file, layer.w1);
|
ggml_format_name(layer.wv, TN_ATTN_V, i);
|
||||||
write_tensor(&file, layer.w2);
|
gguf_add_tensor(ctx, layer.wv);
|
||||||
write_tensor(&file, layer.w3);
|
|
||||||
|
ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
|
||||||
|
gguf_add_tensor(ctx, layer.wo);
|
||||||
|
|
||||||
|
ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
|
||||||
|
gguf_add_tensor(ctx, layer.attention_norm);
|
||||||
|
|
||||||
|
ggml_format_name(layer.w1, TN_FFN_GATE, i);
|
||||||
|
gguf_add_tensor(ctx, layer.w1);
|
||||||
|
|
||||||
|
ggml_format_name(layer.w2, TN_FFN_DOWN, i);
|
||||||
|
gguf_add_tensor(ctx, layer.w2);
|
||||||
|
|
||||||
|
ggml_format_name(layer.w3, TN_FFN_UP, i);
|
||||||
|
gguf_add_tensor(ctx, layer.w3);
|
||||||
|
|
||||||
|
ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
|
||||||
|
gguf_add_tensor(ctx, layer.ffn_norm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gguf_write_to_file(ctx, filename, false);
|
||||||
|
gguf_free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct train_params get_default_train_params() {
|
struct train_params get_default_train_params() {
|
||||||
struct train_params params;
|
struct train_params params;
|
||||||
params.fn_vocab_model = "tokenizer.bin";
|
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||||
params.fn_train_data = "shakespeare.txt";
|
params.fn_train_data = "shakespeare.txt";
|
||||||
params.fn_checkpoint_in = "checkpoint.bin";
|
params.fn_checkpoint_in = "checkpoint.bin";
|
||||||
|
@ -751,7 +837,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "options:\n");
|
fprintf(stderr, "options:\n");
|
||||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||||
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
|
fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
|
||||||
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
|
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
|
||||||
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
|
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
@ -812,6 +898,14 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the final component of `path`: everything after the last directory separator.
//
// - If `path` contains no separator, it is returned unchanged.
// - If `path` ends with a separator, the result is the empty string.
//
// '/' is always treated as a separator. On Windows builds '\\' is accepted as well, so
// native paths such as "models\\stories.bin" are reduced to their file name instead of
// being returned whole.
std::string basename(const std::string &path) {
#if defined(_WIN32)
    const size_t pos = path.find_last_of("/\\");
#else
    const size_t pos = path.find_last_of('/');
#endif
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
struct train_params params = get_default_train_params();
|
struct train_params params = get_default_train_params();
|
||||||
if (!params_parse(argc, argv, ¶ms)) {
|
if (!params_parse(argc, argv, ¶ms)) {
|
||||||
|
@ -840,6 +934,7 @@ int main(int argc, char ** argv) {
|
||||||
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
|
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
|
||||||
model.hparams.n_ctx = params.n_ctx;
|
model.hparams.n_ctx = params.n_ctx;
|
||||||
model.hparams.n_embd = config.dim; //params.n_embd;
|
model.hparams.n_embd = config.dim; //params.n_embd;
|
||||||
|
model.hparams.n_ff = config.hidden_dim;
|
||||||
model.hparams.n_mult = 32;//params.n_mult;
|
model.hparams.n_mult = 32;//params.n_mult;
|
||||||
model.hparams.n_head = config.n_heads; //params.n_head;
|
model.hparams.n_head = config.n_heads; //params.n_head;
|
||||||
model.hparams.n_layer = config.n_layers; //params.n_layer;
|
model.hparams.n_layer = config.n_layers; //params.n_layer;
|
||||||
|
@ -853,6 +948,7 @@ int main(int argc, char ** argv) {
|
||||||
model.ctx = ggml_init(lcparams);
|
model.ctx = ggml_init(lcparams);
|
||||||
|
|
||||||
init_model(&model);
|
init_model(&model);
|
||||||
|
model.name = basename(params.fn_llama2c_model);
|
||||||
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
||||||
|
|
||||||
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
|
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||||
|
|
|
@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) {
|
||||||
if (id == llama_token_eos(ctx)) {
|
if (id == llama_token_eos(ctx)) {
|
||||||
ret = "</s>";
|
ret = "</s>";
|
||||||
} else {
|
} else {
|
||||||
ret = llama_token_to_str(ctx, id);
|
ret = llama_token_to_piece(ctx, id);
|
||||||
}
|
}
|
||||||
eval_id(mymodel, id);
|
eval_id(mymodel, id);
|
||||||
return ret.c_str();
|
return ret.c_str();
|
||||||
|
|
|
@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
|
||||||
params.prompt.insert(0, 1, ' ');
|
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
|
@ -67,7 +64,7 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,6 +30,9 @@ bool gguf_ex_write(const std::string & fname) {
|
||||||
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
||||||
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
||||||
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
||||||
|
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
|
||||||
|
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
|
||||||
|
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
|
||||||
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
||||||
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
||||||
|
|
||||||
|
|
|
@ -441,6 +441,8 @@ struct test {
|
||||||
static const std::string gpu_info;
|
static const std::string gpu_info;
|
||||||
std::string model_filename;
|
std::string model_filename;
|
||||||
std::string model_type;
|
std::string model_type;
|
||||||
|
uint64_t model_size;
|
||||||
|
uint64_t model_n_params;
|
||||||
int n_batch;
|
int n_batch;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
bool f32_kv;
|
bool f32_kv;
|
||||||
|
@ -457,8 +459,10 @@ struct test {
|
||||||
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
|
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
|
||||||
model_filename = inst.model;
|
model_filename = inst.model;
|
||||||
char buf[128];
|
char buf[128];
|
||||||
llama_model_type(lmodel, buf, sizeof(buf));
|
llama_model_desc(lmodel, buf, sizeof(buf));
|
||||||
model_type = buf;
|
model_type = buf;
|
||||||
|
model_size = llama_model_size(lmodel);
|
||||||
|
model_n_params = llama_model_n_params(lmodel);
|
||||||
n_batch = inst.n_batch;
|
n_batch = inst.n_batch;
|
||||||
n_threads = inst.n_threads;
|
n_threads = inst.n_threads;
|
||||||
f32_kv = inst.f32_kv;
|
f32_kv = inst.f32_kv;
|
||||||
|
@ -524,7 +528,7 @@ struct test {
|
||||||
"build_commit", "build_number",
|
"build_commit", "build_number",
|
||||||
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
||||||
"cpu_info", "gpu_info",
|
"cpu_info", "gpu_info",
|
||||||
"model_filename", "model_type",
|
"model_filename", "model_type", "model_size", "model_n_params",
|
||||||
"n_batch", "n_threads", "f16_kv",
|
"n_batch", "n_threads", "f16_kv",
|
||||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
|
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
|
@ -538,6 +542,7 @@ struct test {
|
||||||
|
|
||||||
static field_type get_field_type(const std::string & field) {
|
static field_type get_field_type(const std::string & field) {
|
||||||
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
|
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
|
||||||
|
field == "model_size" || field == "model_n_params" ||
|
||||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||||
field == "n_prompt" || field == "n_gen" ||
|
field == "n_prompt" || field == "n_gen" ||
|
||||||
field == "avg_ns" || field == "stddev_ns") {
|
field == "avg_ns" || field == "stddev_ns") {
|
||||||
|
@ -573,7 +578,7 @@ struct test {
|
||||||
build_commit, std::to_string(build_number),
|
build_commit, std::to_string(build_number),
|
||||||
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type,
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
||||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
|
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||||
|
@ -709,8 +714,15 @@ struct markdown_printer : public printer {
|
||||||
return -30;
|
return -30;
|
||||||
}
|
}
|
||||||
if (field == "t/s") {
|
if (field == "t/s") {
|
||||||
return 15;
|
return 16;
|
||||||
}
|
}
|
||||||
|
if (field == "size" || field == "params") {
|
||||||
|
return 10;
|
||||||
|
}
|
||||||
|
if (field == "n_gpu_layers") {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
int width = std::max((int)field.length(), 10);
|
int width = std::max((int)field.length(), 10);
|
||||||
|
|
||||||
if (test::get_field_type(field) == test::STRING) {
|
if (test::get_field_type(field) == test::STRING) {
|
||||||
|
@ -719,9 +731,28 @@ struct markdown_printer : public printer {
|
||||||
return width;
|
return width;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Translates an internal field name into the label shown for it in the table header.
// A handful of fields have shorter aliases; every other field is shown under its own name.
static std::string get_field_display_name(const std::string & field) {
    // { internal field name, displayed label }
    static const char * const short_labels[][2] = {
        { "n_gpu_layers", "ngl"     },
        { "n_threads",    "threads" },
        { "mul_mat_q",    "mmq"     },
        { "tensor_split", "ts"      },
    };

    for (const auto & entry : short_labels) {
        if (field == entry[0]) {
            return entry[1];
        }
    }
    return field;
}
|
||||||
|
|
||||||
void print_header(const cmd_params & params) override {
|
void print_header(const cmd_params & params) override {
|
||||||
// select fields to print
|
// select fields to print
|
||||||
fields = { "model", "backend" };
|
fields.push_back("model");
|
||||||
|
fields.push_back("size");
|
||||||
|
fields.push_back("params");
|
||||||
|
fields.push_back("backend");
|
||||||
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
||||||
if (!is_cpu_backend) {
|
if (!is_cpu_backend) {
|
||||||
fields.push_back("n_gpu_layers");
|
fields.push_back("n_gpu_layers");
|
||||||
|
@ -752,7 +783,7 @@ struct markdown_printer : public printer {
|
||||||
|
|
||||||
fprintf(fout, "|");
|
fprintf(fout, "|");
|
||||||
for (const auto & field : fields) {
|
for (const auto & field : fields) {
|
||||||
fprintf(fout, " %*s |", get_field_width(field), field.c_str());
|
fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
|
||||||
}
|
}
|
||||||
fprintf(fout, "\n");
|
fprintf(fout, "\n");
|
||||||
fprintf(fout, "|");
|
fprintf(fout, "|");
|
||||||
|
@ -769,12 +800,26 @@ struct markdown_printer : public printer {
|
||||||
fprintf(fout, "|");
|
fprintf(fout, "|");
|
||||||
for (const auto & field : fields) {
|
for (const auto & field : fields) {
|
||||||
std::string value;
|
std::string value;
|
||||||
|
char buf[128];
|
||||||
if (field == "model") {
|
if (field == "model") {
|
||||||
value = t.model_type;
|
value = t.model_type;
|
||||||
|
} else if (field == "size") {
|
||||||
|
if (t.model_size < 1024*1024*1024) {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
|
||||||
|
} else {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
value = buf;
|
||||||
|
} else if (field == "params") {
|
||||||
|
if (t.model_n_params < 1000*1000*1000) {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
|
||||||
|
} else {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
|
||||||
|
}
|
||||||
|
value = buf;
|
||||||
} else if (field == "backend") {
|
} else if (field == "backend") {
|
||||||
value = test::get_backend();
|
value = test::get_backend();
|
||||||
} else if (field == "test") {
|
} else if (field == "test") {
|
||||||
char buf[128];
|
|
||||||
if (t.n_prompt > 0 && t.n_gen == 0) {
|
if (t.n_prompt > 0 && t.n_gen == 0) {
|
||||||
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
|
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
|
||||||
} else if (t.n_gen > 0 && t.n_prompt == 0) {
|
} else if (t.n_gen > 0 && t.n_prompt == 0) {
|
||||||
|
@ -785,7 +830,6 @@ struct markdown_printer : public printer {
|
||||||
}
|
}
|
||||||
value = buf;
|
value = buf;
|
||||||
} else if (field == "t/s") {
|
} else if (field == "t/s") {
|
||||||
char buf[128];
|
|
||||||
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
||||||
value = buf;
|
value = buf;
|
||||||
} else if (vmap.find(field) != vmap.end()) {
|
} else if (vmap.find(field) != vmap.end()) {
|
||||||
|
|
|
@ -189,12 +189,14 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
// Add BOS if SPM tokenizer
|
||||||
|
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
} else {
|
} else {
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
}
|
}
|
||||||
|
@ -209,10 +211,9 @@ int main(int argc, char ** argv) {
|
||||||
int guidance_offset = 0;
|
int guidance_offset = 0;
|
||||||
int original_prompt_len = 0;
|
int original_prompt_len = 0;
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
|
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||||
}
|
}
|
||||||
|
@ -259,7 +260,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
|
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||||
|
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
|
@ -278,7 +279,7 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
|
@ -286,14 +287,14 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
|
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_keep > 0) {
|
if (params.n_keep > 0) {
|
||||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
|
fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "'\n");
|
fprintf(stderr, "'\n");
|
||||||
}
|
}
|
||||||
|
@ -449,7 +450,7 @@ int main(int argc, char ** argv) {
|
||||||
//printf("\n---\n");
|
//printf("\n---\n");
|
||||||
//printf("resetting: '");
|
//printf("resetting: '");
|
||||||
//for (int i = 0; i < (int) embd.size(); i++) {
|
//for (int i = 0; i < (int) embd.size(); i++) {
|
||||||
// printf("%s", llama_token_to_str(ctx, embd[i]));
|
// printf("%s", llama_token_to_piece(ctx, embd[i]));
|
||||||
//}
|
//}
|
||||||
//printf("'\n");
|
//printf("'\n");
|
||||||
//printf("\n---\n");
|
//printf("\n---\n");
|
||||||
|
@ -502,7 +503,7 @@ int main(int argc, char ** argv) {
|
||||||
input_size = embd_guidance.size();
|
input_size = embd_guidance.size();
|
||||||
//fprintf(stderr, "\n---------------------\n");
|
//fprintf(stderr, "\n---------------------\n");
|
||||||
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
||||||
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
|
//fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
|
||||||
//}
|
//}
|
||||||
//fprintf(stderr, "\n---------------------\n");
|
//fprintf(stderr, "\n---------------------\n");
|
||||||
} else {
|
} else {
|
||||||
|
@ -597,7 +598,12 @@ int main(int argc, char ** argv) {
|
||||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||||
last_n_repeat, alpha_frequency, alpha_presence);
|
last_n_repeat, alpha_frequency, alpha_presence);
|
||||||
if (!penalize_nl) {
|
if (!penalize_nl) {
|
||||||
logits[llama_token_nl(ctx)] = nl_logit;
|
for (size_t idx = 0; idx < candidates_p.size; idx++) {
|
||||||
|
if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
|
||||||
|
candidates_p.data[idx].logit = nl_logit;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (grammar != NULL) {
|
if (grammar != NULL) {
|
||||||
|
@ -661,7 +667,7 @@ int main(int argc, char ** argv) {
|
||||||
// display text
|
// display text
|
||||||
if (input_echo) {
|
if (input_echo) {
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
printf("%s", llama_token_to_str(ctx, id).c_str());
|
printf("%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
@ -677,7 +683,7 @@ int main(int argc, char ** argv) {
|
||||||
if (params.antiprompt.size()) {
|
if (params.antiprompt.size()) {
|
||||||
std::string last_output;
|
std::string last_output;
|
||||||
for (auto id : last_n_tokens) {
|
for (auto id : last_n_tokens) {
|
||||||
last_output += llama_token_to_str(ctx, id);
|
last_output += llama_token_to_piece(ctx, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
is_antiprompt = false;
|
is_antiprompt = false;
|
||||||
|
|
|
@ -6,6 +6,8 @@
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <thread>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
@ -27,6 +29,40 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
return probs;
|
return probs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Log-probability of one token under softmax(logits): returns log(softmax(logits)[tok]).
// The largest logit is subtracted before exponentiation, so every expf() argument is <= 0.
//   n_vocab - number of entries in logits
//   logits  - raw scores; logits[0] is always read
//   tok     - index of the entry whose log-probability is returned
float log_softmax(int n_vocab, const float * logits, int tok) {
    // shift value: the maximum logit
    float peak = logits[0];
    for (int k = 1; k < n_vocab; ++k) {
        peak = std::max(peak, logits[k]);
    }

    // normaliser: sum of the shifted exponentials, accumulated in double
    double denom = 0.0;
    for (int k = 0; k < n_vocab; ++k) {
        denom += expf(logits[k] - peak);
    }

    return logits[tok] - peak - log(denom);
}
|
||||||
|
|
||||||
|
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
|
||||||
|
double& nll, double& nll2) {
|
||||||
|
|
||||||
|
std::mutex mutex;
|
||||||
|
int counter = 0;
|
||||||
|
auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
|
||||||
|
double local_nll = 0, local_nll2 = 0;
|
||||||
|
while (true) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex);
|
||||||
|
int i = counter++;
|
||||||
|
if (i >= n_token) {
|
||||||
|
nll += local_nll; nll2 += local_nll2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
lock.unlock();
|
||||||
|
double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||||
|
local_nll += v;
|
||||||
|
local_nll2 += v*v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for (auto& w : workers) w = std::thread(compute);
|
||||||
|
compute();
|
||||||
|
for (auto& w : workers) w.join();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||||
|
@ -154,10 +190,14 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
const bool add_bos = is_spm;
|
const bool add_bos = is_spm;
|
||||||
|
|
||||||
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
|
||||||
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
|
||||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||||
|
|
||||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||||
|
@ -166,9 +206,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
|
double nll2 = 0.0;
|
||||||
|
|
||||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
|
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||||
|
|
||||||
for (int i = 0; i < n_chunk; ++i) {
|
for (int i = 0; i < n_chunk; ++i) {
|
||||||
const int start = i * params.n_ctx;
|
const int start = i * params.n_ctx;
|
||||||
const int end = start + params.n_ctx;
|
const int end = start + params.n_ctx;
|
||||||
|
@ -228,26 +271,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
// Example, we have a context window of 512, we will compute perplexity for each of the
|
// Example, we have a context window of 512, we will compute perplexity for each of the
|
||||||
// last 256 tokens. Then, we split the input up into context window size chunks to
|
// last 256 tokens. Then, we split the input up into context window size chunks to
|
||||||
// process the entire prompt.
|
// process the entire prompt.
|
||||||
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
|
const int first = std::min(512, params.n_ctx/2);
|
||||||
// Calculate probability of next token, given the previous ones.
|
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
|
||||||
const std::vector<float> tok_logits(
|
count += params.n_ctx - first - 1;
|
||||||
logits.begin() + (j + 0) * n_vocab,
|
|
||||||
logits.begin() + (j + 1) * n_vocab);
|
|
||||||
|
|
||||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
|
||||||
|
|
||||||
nll += -std::log(prob);
|
|
||||||
++count;
|
|
||||||
}
|
|
||||||
// perplexity is e^(average negative log-likelihood)
|
// perplexity is e^(average negative log-likelihood)
|
||||||
if (params.ppl_output_type == 0) {
|
if (params.ppl_output_type == 0) {
|
||||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||||
} else {
|
} else {
|
||||||
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
|
double av = nll/count;
|
||||||
|
double av2 = nll2/count - av*av;
|
||||||
|
if (av2 > 0) av2 = sqrt(av2/(count-1));
|
||||||
|
printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
nll2 /= count;
|
||||||
|
nll /= count;
|
||||||
|
nll2 -= nll * nll;
|
||||||
|
if (nll2 > 0) {
|
||||||
|
nll2 = sqrt(nll2/(count-1));
|
||||||
|
double ppl = exp(nll);
|
||||||
|
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||||
|
} else {
|
||||||
|
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
||||||
|
@ -306,6 +355,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
// This is needed as usual for LLaMA models
|
||||||
const bool add_bos = is_spm;
|
const bool add_bos = is_spm;
|
||||||
|
@ -346,7 +396,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
hs_data[i].context = prompt_lines[idx*6];
|
hs_data[i].context = prompt_lines[idx*6];
|
||||||
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||||
for (size_t j=0; j < 4; j++) {
|
for (size_t j=0; j < 4; j++) {
|
||||||
hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
|
hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Delete the selected random example from the prompt
|
// Delete the selected random example from the prompt
|
||||||
|
@ -361,6 +411,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
double acc = 0.0f;
|
double acc = 0.0f;
|
||||||
const int n_vocab = llama_n_vocab(ctx);
|
const int n_vocab = llama_n_vocab(ctx);
|
||||||
|
|
||||||
|
std::vector<std::vector<int>> ending_tokens(4);
|
||||||
|
|
||||||
std::vector<float> tok_logits(n_vocab);
|
std::vector<float> tok_logits(n_vocab);
|
||||||
|
|
||||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
||||||
|
@ -368,11 +420,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
||||||
size_t context_size = context_embd.size();
|
size_t context_size = context_embd.size();
|
||||||
|
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
|
||||||
|
for (int k = 0; k < int(context_size); ++k) {
|
||||||
|
if (ending_tokens[i][k] != context_embd[k]) {
|
||||||
|
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Do the 1st ending
|
// Do the 1st ending
|
||||||
// In this case we include the context when evaluating
|
// In this case we include the context when evaluating
|
||||||
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
//auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
||||||
|
auto query_embd = ending_tokens[0];
|
||||||
auto query_size = query_embd.size();
|
auto query_size = query_embd.size();
|
||||||
//printf("First query: %d\n",(int)query_size);
|
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
// Stop if query wont fit the ctx window
|
||||||
if (query_size > (size_t)params.n_ctx) {
|
if (query_size > (size_t)params.n_ctx) {
|
||||||
|
@ -417,7 +479,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
|
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
|
||||||
|
|
||||||
// Tokenize the query
|
// Tokenize the query
|
||||||
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
|
query_embd.resize(ending_tokens[ending_idx].size() - context_size);
|
||||||
|
std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
|
||||||
query_size = query_embd.size();
|
query_size = query_embd.size();
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
// Stop if query wont fit the ctx window
|
||||||
|
|
|
@ -87,7 +87,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||||
last_n_tokens_data.push_back(next_token);
|
last_n_tokens_data.push_back(next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
|
@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
||||||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||||
last_n_tokens_data.push_back(next_token);
|
last_n_tokens_data.push_back(next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
|
|
|
@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
|
||||||
```bash
|
```bash
|
||||||
mkdir llama-client
|
mkdir llama-client
|
||||||
cd llama-client
|
cd llama-client
|
||||||
npm init
|
|
||||||
npm install axios
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Create an index.js file and put this inside:
|
Create an index.js file and put this inside:
|
||||||
|
|
||||||
```javascript
|
```javascript
|
||||||
const axios = require("axios");
|
|
||||||
|
|
||||||
const prompt = `Building a website can be done in 10 simple steps:`;
|
const prompt = `Building a website can be done in 10 simple steps:`;
|
||||||
|
|
||||||
async function Test() {
|
async function Test() {
|
||||||
let result = await axios.post("http://127.0.0.1:8080/completion", {
|
let response = await fetch("http://127.0.0.1:8080/completion", {
|
||||||
prompt,
|
method: 'POST',
|
||||||
n_predict: 512,
|
body: JSON.stringify({
|
||||||
});
|
prompt,
|
||||||
|
n_predict: 512,
|
||||||
// the response is received until completion finish
|
})
|
||||||
console.log(result.data.content);
|
})
|
||||||
|
console.log((await response.json()).content)
|
||||||
}
|
}
|
||||||
|
|
||||||
Test();
|
Test()
|
||||||
```
|
```
|
||||||
|
|
||||||
And run it:
|
And run it:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
node .
|
node index.js
|
||||||
```
|
```
|
||||||
|
|
||||||
## API Endpoints
|
## API Endpoints
|
||||||
|
@ -167,6 +164,12 @@ node .
|
||||||
|
|
||||||
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
||||||
|
|
||||||
|
- **POST** `/detokenize`: Convert tokens to text.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
`tokens`: Set the tokens to detokenize.
|
||||||
|
|
||||||
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
|
@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||||
std::string ret;
|
std::string ret;
|
||||||
for (; begin != end; ++begin)
|
for (; begin != end; ++begin)
|
||||||
{
|
{
|
||||||
ret += llama_token_to_str(ctx, *begin);
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line,
|
||||||
// format incomplete utf-8 multibyte character for output
|
// format incomplete utf-8 multibyte character for output
|
||||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||||
{
|
{
|
||||||
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
|
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
||||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||||
// (size > 1 meaning it's already a known token)
|
// (size > 1 meaning it's already a known token)
|
||||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||||
|
@ -286,7 +286,6 @@ struct llama_server_context
|
||||||
std::vector<llama_token> p;
|
std::vector<llama_token> p;
|
||||||
if (first)
|
if (first)
|
||||||
{
|
{
|
||||||
s.insert(0, 1, ' '); // add a space if it's the first
|
|
||||||
p = ::llama_tokenize(ctx, s, add_bos);
|
p = ::llama_tokenize(ctx, s, add_bos);
|
||||||
first = false;
|
first = false;
|
||||||
}
|
}
|
||||||
|
@ -309,7 +308,6 @@ struct llama_server_context
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
auto s = json_prompt.template get<std::string>();
|
auto s = json_prompt.template get<std::string>();
|
||||||
s.insert(0, 1, ' '); // always add a first space
|
|
||||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
|
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -566,7 +564,7 @@ struct llama_server_context
|
||||||
|
|
||||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
|
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
|
||||||
{
|
{
|
||||||
// stopping_word = llama_token_to_str(ctx, embd.back());
|
// stopping_word = llama_token_to_piece(ctx, embd.back());
|
||||||
has_next_token = false;
|
has_next_token = false;
|
||||||
stopped_eos = true;
|
stopped_eos = true;
|
||||||
LOG_VERBOSE("eos token found", {});
|
LOG_VERBOSE("eos token found", {});
|
||||||
|
@ -613,7 +611,7 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
const completion_token_output token_with_probs = nextToken();
|
const completion_token_output token_with_probs = nextToken();
|
||||||
|
|
||||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
|
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
||||||
generated_text += token_text;
|
generated_text += token_text;
|
||||||
|
|
||||||
if (params.n_probs > 0)
|
if (params.n_probs > 0)
|
||||||
|
@ -1104,6 +1102,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
|
||||||
{"tokens", tokens}};
|
{"tokens", tokens}};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build a JSON object with a single "content" key holding the given text.
static json format_detokenized_response(std::string content)
{
    json response = {{"content", content}};
    return response;
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
static T json_value(const json &body, const std::string &key, const T &default_value)
|
||||||
{
|
{
|
||||||
|
@ -1209,6 +1213,62 @@ static void log_server_request(const Request &req, const Response &res)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// True when the token sequence is non-empty and its last token equals the
// EOS token reported for the server's llama context.
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
    if (n_tokens == 0) {
        return false;
    }
    const llama_token last = tokens[n_tokens - 1];
    return last == llama_token_eos(server_context.ctx);
}
|
||||||
|
|
||||||
|
// Function matching type llama_beam_search_callback_fn_t.
|
||||||
|
// Custom callback example is called each time the beams lengths increase:
|
||||||
|
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
||||||
|
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||||
|
// This is also called when the stop condition is met.
|
||||||
|
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||||
|
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||||
|
auto & llama = *static_cast<llama_server_context*>(callback_data);
|
||||||
|
// Mark beams as EOS as needed.
|
||||||
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
llama_beam_view& beam_view = beams_state.beam_views[i];
|
||||||
|
if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
|
||||||
|
beam_view.eob = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(","); // Show progress
|
||||||
|
if (const size_t n = beams_state.common_prefix_length) {
|
||||||
|
llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
|
||||||
|
assert(0u < beams_state.n_beams);
|
||||||
|
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||||
|
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
|
||||||
|
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
|
||||||
|
printf("%lu", n);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
#if 0 // DEBUG: print current beams for this iteration
|
||||||
|
std::cout << "\n\nCurrent beams:\n";
|
||||||
|
for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
struct token_translator {
|
||||||
|
llama_context * ctx;
|
||||||
|
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
||||||
|
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
|
||||||
|
};
|
||||||
|
|
||||||
|
// Append the text piece of every entry in llama.generated_token_probs to llama.generated_text.
// The combined length is measured first so the required capacity can be reserved before appending.
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
    const token_translator to_piece{llama.ctx};
    auto & outputs = llama.generated_token_probs;
    auto & text    = llama.generated_text;

    // first pass: total number of characters that will be appended
    size_t extra = 0;
    for (const completion_token_output & cto : outputs) {
        extra += to_piece(cto).size();
    }

    // only reserve when the current capacity is insufficient
    if (text.capacity() < text.size() + extra) {
        text.reserve(text.size() + extra);
    }

    // second pass: append the pieces in order
    for (const completion_token_output & cto : outputs) {
        text += to_piece(cto);
    }
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
// own arguments required by this example
|
// own arguments required by this example
|
||||||
|
@ -1291,22 +1351,30 @@ int main(int argc, char **argv)
|
||||||
llama.beginCompletion();
|
llama.beginCompletion();
|
||||||
|
|
||||||
if (!llama.stream) {
|
if (!llama.stream) {
|
||||||
size_t stop_pos = std::string::npos;
|
if (llama.params.n_beams) {
|
||||||
|
// Fill llama.generated_token_probs vector with final beam.
|
||||||
|
llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
|
||||||
|
llama.n_past, llama.n_remain, llama.params.n_threads);
|
||||||
|
// Translate llama.generated_token_probs to llama.generated_text.
|
||||||
|
append_to_generated_text_from_generated_token_probs(llama);
|
||||||
|
} else {
|
||||||
|
size_t stop_pos = std::string::npos;
|
||||||
|
|
||||||
while (llama.has_next_token) {
|
while (llama.has_next_token) {
|
||||||
const completion_token_output token_with_probs = llama.doCompletion();
|
const completion_token_output token_with_probs = llama.doCompletion();
|
||||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
|
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
|
||||||
|
|
||||||
stop_pos = llama.findStoppingStrings(llama.generated_text,
|
stop_pos = llama.findStoppingStrings(llama.generated_text,
|
||||||
token_text.size(), STOP_FULL);
|
token_text.size(), STOP_FULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stop_pos == std::string::npos) {
|
if (stop_pos == std::string::npos) {
|
||||||
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
|
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
|
||||||
}
|
}
|
||||||
if (stop_pos != std::string::npos) {
|
if (stop_pos != std::string::npos) {
|
||||||
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
|
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
|
||||||
llama.generated_text.end());
|
llama.generated_text.end());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
||||||
|
@ -1325,7 +1393,7 @@ int main(int argc, char **argv)
|
||||||
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
|
const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
|
||||||
|
|
||||||
size_t pos = std::min(sent_count, llama.generated_text.size());
|
size_t pos = std::min(sent_count, llama.generated_text.size());
|
||||||
|
|
||||||
|
@ -1437,6 +1505,21 @@ int main(int argc, char **argv)
|
||||||
const json data = format_tokenizer_response(tokens);
|
const json data = format_tokenizer_response(tokens);
|
||||||
return res.set_content(data.dump(), "application/json"); });
|
return res.set_content(data.dump(), "application/json"); });
|
||||||
|
|
||||||
|
svr.Post("/detokenize", [&llama](const Request &req, Response &res)
|
||||||
|
{
|
||||||
|
auto lock = llama.lock();
|
||||||
|
|
||||||
|
const json body = json::parse(req.body);
|
||||||
|
std::string content;
|
||||||
|
if (body.count("tokens") != 0)
|
||||||
|
{
|
||||||
|
const std::vector<llama_token> tokens = body["tokens"];
|
||||||
|
content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
|
||||||
|
}
|
||||||
|
|
||||||
|
const json data = format_detokenized_response(content);
|
||||||
|
return res.set_content(data.dump(), "application/json"); });
|
||||||
|
|
||||||
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
||||||
{
|
{
|
||||||
auto lock = llama.lock();
|
auto lock = llama.lock();
|
||||||
|
|
|
@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "\n\n");
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
for (auto id : tokens_list) {
|
for (auto id : tokens_list) {
|
||||||
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
|
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
@ -112,7 +112,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// print the new token :
|
// print the new token :
|
||||||
printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
|
printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
// push this new token for next evaluation
|
// push this new token for next evaluation
|
||||||
|
|
|
@ -1964,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
|
||||||
|
|
||||||
|
|
||||||
void print_token(struct llama_context * ctx, llama_token token) {
|
void print_token(struct llama_context * ctx, llama_token token) {
|
||||||
printf("%s", llama_token_to_str(ctx, token).c_str());
|
printf("%s", llama_token_to_piece(ctx, token).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
|
void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
|
||||||
|
@ -2202,7 +2202,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
|
||||||
const char * in = buf.data();
|
const char * in = buf.data();
|
||||||
const char * end = buf.data() + buf.size();
|
const char * end = buf.data() + buf.size();
|
||||||
for (int i = 0; i < (int) out.size(); ++i) {
|
for (int i = 0; i < (int) out.size(); ++i) {
|
||||||
std::string s = llama_token_to_str(lctx, out[i]);
|
std::string s = llama_token_to_piece(lctx, out[i]);
|
||||||
int len = s.length();
|
int len = s.length();
|
||||||
if (in >= end) {
|
if (in >= end) {
|
||||||
printf("%s: unexpected end of original text.\n", __func__);
|
printf("%s: unexpected end of original text.\n", __func__);
|
||||||
|
|
12
flake.lock
generated
12
flake.lock
generated
|
@ -5,11 +5,11 @@
|
||||||
"systems": "systems"
|
"systems": "systems"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1685518550,
|
"lastModified": 1692799911,
|
||||||
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
|
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
|
||||||
"owner": "numtide",
|
"owner": "numtide",
|
||||||
"repo": "flake-utils",
|
"repo": "flake-utils",
|
||||||
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
|
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1685931219,
|
"lastModified": 1692913444,
|
||||||
"narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
|
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
|
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
54
flake.nix
54
flake.nix
|
@ -6,6 +6,9 @@
|
||||||
outputs = { self, nixpkgs, flake-utils }:
|
outputs = { self, nixpkgs, flake-utils }:
|
||||||
flake-utils.lib.eachDefaultSystem (system:
|
flake-utils.lib.eachDefaultSystem (system:
|
||||||
let
|
let
|
||||||
|
name = "llama.cpp";
|
||||||
|
src = ./.;
|
||||||
|
meta.mainProgram = "llama";
|
||||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||||
buildInputs = with pkgs; [ openmpi ];
|
buildInputs = with pkgs; [ openmpi ];
|
||||||
osSpecific = with pkgs; buildInputs ++
|
osSpecific = with pkgs; buildInputs ++
|
||||||
|
@ -21,11 +24,17 @@
|
||||||
CoreGraphics
|
CoreGraphics
|
||||||
CoreVideo
|
CoreVideo
|
||||||
]
|
]
|
||||||
|
else if isDarwin then
|
||||||
|
with pkgs.darwin.apple_sdk.frameworks; [
|
||||||
|
Accelerate
|
||||||
|
CoreGraphics
|
||||||
|
CoreVideo
|
||||||
|
]
|
||||||
else
|
else
|
||||||
with pkgs; [ openblas ]
|
with pkgs; [ openblas ]
|
||||||
);
|
);
|
||||||
pkgs = import nixpkgs { inherit system; };
|
pkgs = import nixpkgs { inherit system; };
|
||||||
nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
|
nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
|
||||||
llama-python =
|
llama-python =
|
||||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||||
postPatch = ''
|
postPatch = ''
|
||||||
|
@ -38,35 +47,35 @@
|
||||||
mv $out/bin/server $out/bin/llama-server
|
mv $out/bin/server $out/bin/llama-server
|
||||||
'';
|
'';
|
||||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||||
in {
|
in
|
||||||
|
{
|
||||||
packages.default = pkgs.stdenv.mkDerivation {
|
packages.default = pkgs.stdenv.mkDerivation {
|
||||||
name = "llama.cpp";
|
inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
|
||||||
src = ./.;
|
|
||||||
postPatch = postPatch;
|
|
||||||
nativeBuildInputs = nativeBuildInputs;
|
|
||||||
buildInputs = osSpecific;
|
|
||||||
cmakeFlags = cmakeFlags
|
cmakeFlags = cmakeFlags
|
||||||
++ (if isAarch64 && isDarwin then [
|
++ (if isAarch64 && isDarwin then [
|
||||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||||
"-DLLAMA_METAL=ON"
|
"-DLLAMA_METAL=ON"
|
||||||
] else [
|
] else [
|
||||||
"-DLLAMA_BLAS=ON"
|
"-DLLAMA_BLAS=ON"
|
||||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||||
]);
|
]);
|
||||||
postInstall = postInstall;
|
|
||||||
meta.mainProgram = "llama";
|
|
||||||
};
|
};
|
||||||
packages.opencl = pkgs.stdenv.mkDerivation {
|
packages.opencl = pkgs.stdenv.mkDerivation {
|
||||||
name = "llama.cpp";
|
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||||
src = ./.;
|
|
||||||
postPatch = postPatch;
|
|
||||||
nativeBuildInputs = nativeBuildInputs;
|
|
||||||
buildInputs = with pkgs; buildInputs ++ [ clblast ];
|
buildInputs = with pkgs; buildInputs ++ [ clblast ];
|
||||||
cmakeFlags = cmakeFlags ++ [
|
cmakeFlags = cmakeFlags ++ [
|
||||||
"-DLLAMA_CLBLAST=ON"
|
"-DLLAMA_CLBLAST=ON"
|
||||||
];
|
];
|
||||||
postInstall = postInstall;
|
};
|
||||||
meta.mainProgram = "llama";
|
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||||
|
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||||
|
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
|
||||||
|
cmakeFlags = cmakeFlags ++ [
|
||||||
|
"-DLLAMA_HIPBLAS=1"
|
||||||
|
"-DCMAKE_C_COMPILER=hipcc"
|
||||||
|
"-DCMAKE_CXX_COMPILER=hipcc"
|
||||||
|
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
|
||||||
|
];
|
||||||
};
|
};
|
||||||
apps.llama-server = {
|
apps.llama-server = {
|
||||||
type = "app";
|
type = "app";
|
||||||
|
@ -80,8 +89,13 @@
|
||||||
type = "app";
|
type = "app";
|
||||||
program = "${self.packages.${system}.default}/bin/llama";
|
program = "${self.packages.${system}.default}/bin/llama";
|
||||||
};
|
};
|
||||||
|
apps.quantize = {
|
||||||
|
type = "app";
|
||||||
|
program = "${self.packages.${system}.default}/bin/quantize";
|
||||||
|
};
|
||||||
apps.default = self.apps.${system}.llama;
|
apps.default = self.apps.${system}.llama;
|
||||||
devShells.default = pkgs.mkShell {
|
devShells.default = pkgs.mkShell {
|
||||||
|
buildInputs = [ llama-python ];
|
||||||
packages = nativeBuildInputs ++ osSpecific;
|
packages = nativeBuildInputs ++ osSpecific;
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
27
ggml-cuda.cu
27
ggml-cuda.cu
|
@ -306,11 +306,11 @@ typedef struct {
|
||||||
#define QI4_K (QK_K / (4*QR4_K))
|
#define QI4_K (QK_K / (4*QR4_K))
|
||||||
#ifdef GGML_QKK_64
|
#ifdef GGML_QKK_64
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d[2]; // super-block scales/mins
|
half dm[2]; // super-block scales/mins
|
||||||
uint8_t scales[2]; // 4-bit block scales/mins
|
uint8_t scales[2]; // 4-bit block scales/mins
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||||
} block_q4_K;
|
} block_q4_K;
|
||||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||||
#else
|
#else
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half2 dm; // super-block scale for quantized scales/mins
|
half2 dm; // super-block scale for quantized scales/mins
|
||||||
|
@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const uint8_t * q = x[i].qs;
|
const uint8_t * q = x[i].qs;
|
||||||
float * y = yy + i*QK_K;
|
float * y = yy + i*QK_K;
|
||||||
const float d = (float)x[i].d[0];
|
const float d = (float)x[i].dm[0];
|
||||||
const float m = (float)x[i].d[1];
|
const float m = (float)x[i].dm[1];
|
||||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
||||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
||||||
#endif
|
#endif
|
||||||
|
@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||||
aux16[0] = a[0] & 0x0f0f;
|
aux16[0] = a[0] & 0x0f0f;
|
||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||||
const float d = (float)x[i].d[0];
|
const float d = (float)x[i].dm[0];
|
||||||
const float m = (float)x[i].d[1];
|
const float m = (float)x[i].dm[1];
|
||||||
float sum = 0.f;
|
float sum = 0.f;
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
||||||
|
@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||||
aux16[0] = a[0] & 0x0f0f;
|
aux16[0] = a[0] & 0x0f0f;
|
||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||||
|
|
||||||
const float dall = bq4_K->d[0];
|
const float dall = bq4_K->dm[0];
|
||||||
const float dmin = bq4_K->d[1];
|
const float dmin = bq4_K->dm[1];
|
||||||
|
|
||||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||||
|
@ -2929,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||||
|
|
||||||
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||||
|
|
||||||
|
#if QK_K == 256
|
||||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
||||||
|
#else
|
||||||
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -3119,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||||
|
|
||||||
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||||
|
|
||||||
|
#if QK_K == 256
|
||||||
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
|
#if QK_K == 256
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_compute_capabilities[id];
|
||||||
|
@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
||||||
mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
|
mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
|
||||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_mul_mat_q4_K_q8_1_cuda(
|
static void ggml_mul_mat_q4_K_q8_1_cuda(
|
||||||
|
@ -6328,9 +6337,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
|
||||||
|
|
||||||
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
||||||
|
|
||||||
const int mode = ((int32_t *) dst->op_params)[2];
|
const int mode = ((int32_t *) dst->op_params)[2];
|
||||||
const bool is_glm = mode & 4;
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
145
ggml.c
145
ggml.c
|
@ -19394,7 +19394,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
struct gguf_str {
|
struct gguf_str {
|
||||||
uint32_t n;
|
uint64_t n; // GGUFv2
|
||||||
char * data;
|
char * data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -19408,9 +19408,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
|
||||||
[GGUF_TYPE_FLOAT32] = sizeof(float),
|
[GGUF_TYPE_FLOAT32] = sizeof(float),
|
||||||
[GGUF_TYPE_BOOL] = sizeof(bool),
|
[GGUF_TYPE_BOOL] = sizeof(bool),
|
||||||
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
|
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
|
||||||
|
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
|
||||||
|
[GGUF_TYPE_INT64] = sizeof(int64_t),
|
||||||
|
[GGUF_TYPE_FLOAT64] = sizeof(double),
|
||||||
[GGUF_TYPE_ARRAY] = 0, // undefined
|
[GGUF_TYPE_ARRAY] = 0, // undefined
|
||||||
};
|
};
|
||||||
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
|
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
||||||
|
|
||||||
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
||||||
[GGUF_TYPE_UINT8] = "u8",
|
[GGUF_TYPE_UINT8] = "u8",
|
||||||
|
@ -19423,8 +19426,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
||||||
[GGUF_TYPE_BOOL] = "bool",
|
[GGUF_TYPE_BOOL] = "bool",
|
||||||
[GGUF_TYPE_STRING] = "str",
|
[GGUF_TYPE_STRING] = "str",
|
||||||
[GGUF_TYPE_ARRAY] = "arr",
|
[GGUF_TYPE_ARRAY] = "arr",
|
||||||
|
[GGUF_TYPE_UINT64] = "u64",
|
||||||
|
[GGUF_TYPE_INT64] = "i64",
|
||||||
|
[GGUF_TYPE_FLOAT64] = "f64",
|
||||||
};
|
};
|
||||||
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
|
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
||||||
|
|
||||||
union gguf_value {
|
union gguf_value {
|
||||||
uint8_t uint8;
|
uint8_t uint8;
|
||||||
|
@ -19434,6 +19440,9 @@ union gguf_value {
|
||||||
uint32_t uint32;
|
uint32_t uint32;
|
||||||
int32_t int32;
|
int32_t int32;
|
||||||
float float32;
|
float float32;
|
||||||
|
uint64_t uint64;
|
||||||
|
int64_t int64;
|
||||||
|
double float64;
|
||||||
bool bool_;
|
bool bool_;
|
||||||
|
|
||||||
struct gguf_str str;
|
struct gguf_str str;
|
||||||
|
@ -19441,7 +19450,7 @@ union gguf_value {
|
||||||
struct {
|
struct {
|
||||||
enum gguf_type type;
|
enum gguf_type type;
|
||||||
|
|
||||||
uint32_t n;
|
uint64_t n; // GGUFv2
|
||||||
void * data;
|
void * data;
|
||||||
} arr;
|
} arr;
|
||||||
};
|
};
|
||||||
|
@ -19449,8 +19458,6 @@ union gguf_value {
|
||||||
struct gguf_kv {
|
struct gguf_kv {
|
||||||
struct gguf_str key;
|
struct gguf_str key;
|
||||||
|
|
||||||
uint32_t n_bytes; // TODO: is this actually needed?
|
|
||||||
|
|
||||||
enum gguf_type type;
|
enum gguf_type type;
|
||||||
union gguf_value value;
|
union gguf_value value;
|
||||||
};
|
};
|
||||||
|
@ -19458,15 +19465,15 @@ struct gguf_kv {
|
||||||
struct gguf_header {
|
struct gguf_header {
|
||||||
uint32_t magic;
|
uint32_t magic;
|
||||||
uint32_t version;
|
uint32_t version;
|
||||||
uint32_t n_tensors;
|
uint64_t n_tensors; // GGUFv2
|
||||||
uint32_t n_kv;
|
uint64_t n_kv; // GGUFv2
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gguf_tensor_info {
|
struct gguf_tensor_info {
|
||||||
struct gguf_str name;
|
struct gguf_str name;
|
||||||
|
|
||||||
uint32_t n_dims;
|
uint32_t n_dims;
|
||||||
uint32_t ne[GGML_MAX_DIMS];
|
uint64_t ne[GGML_MAX_DIMS];
|
||||||
|
|
||||||
enum ggml_type type;
|
enum ggml_type type;
|
||||||
|
|
||||||
|
@ -19497,19 +19504,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
|
||||||
return n == size;
|
return n == size;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||||
|
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||||
p->n = 0;
|
p->n = 0;
|
||||||
p->data = NULL;
|
p->data = NULL;
|
||||||
|
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
|
|
||||||
// TODO: how to avoid mallocs for strings?
|
|
||||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
||||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||||
|
|
||||||
return ok;
|
return ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Legacy (GGUFv1) string reader: the length prefix is 32 bits wide.
// Reads the length, allocates length + 1 zeroed bytes (the extra byte keeps the
// buffer NUL-terminated) and reads the payload into it.
//
// On success returns true with p->data pointing at the new buffer and p->n set
// to the length. On failure returns false; p->data is then either NULL or a
// calloc'ed buffer, so it is always safe to pass to free().
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

    uint32_t n = 0;
    if (!gguf_fread_el(file, &n, sizeof(n), offset)) {
        return false; // no length available, so there is nothing to allocate
    }

    // Do the +1 in size_t. In 32-bit arithmetic n == UINT32_MAX wraps to 0, which
    // would yield a zero-byte buffer that the payload read below overruns.
    const size_t len = (size_t) n;
    if (len == SIZE_MAX) {
        return false; // len + 1 would still wrap where size_t is 32 bits wide
    }

    p->data = calloc(len + 1, 1);
    if (p->data == NULL) {
        return false; // out of memory: never read into a NULL buffer
    }
    p->n = n;

    return gguf_fread_el(file, p->data, p->n, offset);
}
|
||||||
|
|
||||||
struct gguf_context * gguf_init_empty(void) {
|
struct gguf_context * gguf_init_empty(void) {
|
||||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||||
|
|
||||||
|
@ -19565,8 +19585,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
ctx->data = NULL;
|
ctx->data = NULL;
|
||||||
|
|
||||||
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
||||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
|
||||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
if (ctx->header.version == 1) {
|
||||||
|
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||||
|
uint32_t n_tensors = 0;
|
||||||
|
uint32_t n_kv = 0;
|
||||||
|
|
||||||
|
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
||||||
|
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
||||||
|
|
||||||
|
ctx->header.n_tensors = n_tensors;
|
||||||
|
ctx->header.n_kv = n_kv;
|
||||||
|
} else {
|
||||||
|
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||||
|
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||||
|
}
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||||
|
@ -19576,6 +19609,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||||
|
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
||||||
|
if (ctx->header.version == 1) {
|
||||||
|
gguf_fread_str = gguf_fread_str_v1;
|
||||||
|
}
|
||||||
|
|
||||||
// read the kv pairs
|
// read the kv pairs
|
||||||
{
|
{
|
||||||
ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||||
|
@ -19585,9 +19624,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
|
|
||||||
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
||||||
|
|
||||||
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
||||||
//ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
|
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
||||||
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
|
||||||
|
|
||||||
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
||||||
|
|
||||||
|
@ -19599,12 +19637,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
||||||
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
||||||
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
||||||
|
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
|
||||||
|
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
|
||||||
|
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
|
||||||
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
||||||
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
||||||
case GGUF_TYPE_ARRAY:
|
case GGUF_TYPE_ARRAY:
|
||||||
{
|
{
|
||||||
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
|
||||||
|
if (ctx->header.version == 1) {
|
||||||
|
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||||
|
uint32_t n = 0;
|
||||||
|
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
||||||
|
kv->value.arr.n = n;
|
||||||
|
} else {
|
||||||
|
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||||
|
}
|
||||||
|
|
||||||
switch (kv->value.arr.type) {
|
switch (kv->value.arr.type) {
|
||||||
case GGUF_TYPE_UINT8:
|
case GGUF_TYPE_UINT8:
|
||||||
|
@ -19614,6 +19663,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
case GGUF_TYPE_UINT32:
|
case GGUF_TYPE_UINT32:
|
||||||
case GGUF_TYPE_INT32:
|
case GGUF_TYPE_INT32:
|
||||||
case GGUF_TYPE_FLOAT32:
|
case GGUF_TYPE_FLOAT32:
|
||||||
|
case GGUF_TYPE_UINT64:
|
||||||
|
case GGUF_TYPE_INT64:
|
||||||
|
case GGUF_TYPE_FLOAT64:
|
||||||
case GGUF_TYPE_BOOL:
|
case GGUF_TYPE_BOOL:
|
||||||
{
|
{
|
||||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||||
|
@ -19660,7 +19712,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
if (ctx->header.version == 1) {
|
||||||
|
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||||
|
uint32_t t = 0;
|
||||||
|
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
||||||
|
info->ne[j] = t;
|
||||||
|
} else {
|
||||||
|
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||||
|
@ -19954,6 +20013,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
||||||
return ctx->kv[i].value.float32;
|
return ctx->kv[i].value.float32;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
|
||||||
|
return ctx->kv[i].value.uint64;
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
|
||||||
|
return ctx->kv[i].value.int64;
|
||||||
|
}
|
||||||
|
|
||||||
|
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
|
||||||
|
return ctx->kv[i].value.float64;
|
||||||
|
}
|
||||||
|
|
||||||
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
||||||
return ctx->kv[i].value.bool_;
|
return ctx->kv[i].value.bool_;
|
||||||
}
|
}
|
||||||
|
@ -20056,6 +20127,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
|
||||||
ctx->kv[idx].value.float32 = val;
|
ctx->kv[idx].value.float32 = val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_UINT64;
|
||||||
|
ctx->kv[idx].value.uint64 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_INT64;
|
||||||
|
ctx->kv[idx].value.int64 = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
|
||||||
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
|
||||||
|
ctx->kv[idx].value.float64 = val;
|
||||||
|
}
|
||||||
|
|
||||||
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
|
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
|
||||||
const int idx = gguf_get_or_add_key(ctx, key);
|
const int idx = gguf_get_or_add_key(ctx, key);
|
||||||
|
|
||||||
|
@ -20106,6 +20198,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||||
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
|
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
|
||||||
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
|
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
|
||||||
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
|
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
|
||||||
|
case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
|
||||||
|
case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
|
||||||
|
case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
|
||||||
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
|
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
|
||||||
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
|
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
|
||||||
case GGUF_TYPE_ARRAY:
|
case GGUF_TYPE_ARRAY:
|
||||||
|
@ -20267,6 +20362,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
||||||
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
||||||
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
||||||
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
||||||
|
case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
|
||||||
|
case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
|
||||||
|
case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
|
||||||
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
||||||
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
||||||
case GGUF_TYPE_ARRAY:
|
case GGUF_TYPE_ARRAY:
|
||||||
|
@ -20282,6 +20380,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
||||||
case GGUF_TYPE_UINT32:
|
case GGUF_TYPE_UINT32:
|
||||||
case GGUF_TYPE_INT32:
|
case GGUF_TYPE_INT32:
|
||||||
case GGUF_TYPE_FLOAT32:
|
case GGUF_TYPE_FLOAT32:
|
||||||
|
case GGUF_TYPE_UINT64:
|
||||||
|
case GGUF_TYPE_INT64:
|
||||||
|
case GGUF_TYPE_FLOAT64:
|
||||||
case GGUF_TYPE_BOOL:
|
case GGUF_TYPE_BOOL:
|
||||||
{
|
{
|
||||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||||
|
@ -20516,6 +20617,14 @@ int ggml_cpu_has_sse3(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether this translation unit was compiled with __SSSE3__ defined.
// The answer is fixed at build time: 1 if the macro was defined, 0 otherwise.
int ggml_cpu_has_ssse3(void) {
#ifdef __SSSE3__
    const int has_ssse3 = 1;
#else
    const int has_ssse3 = 0;
#endif
    return has_ssse3;
}
|
||||||
|
|
||||||
int ggml_cpu_has_vsx(void) {
|
int ggml_cpu_has_vsx(void) {
|
||||||
#if defined(__POWER9_VECTOR__)
|
#if defined(__POWER9_VECTOR__)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
12
ggml.h
12
ggml.h
|
@ -216,7 +216,7 @@
|
||||||
#define GGML_EXIT_ABORTED 1
|
#define GGML_EXIT_ABORTED 1
|
||||||
|
|
||||||
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
||||||
#define GGUF_VERSION 1
|
#define GGUF_VERSION 2
|
||||||
|
|
||||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||||
|
|
||||||
|
@ -1827,6 +1827,9 @@ extern "C" {
|
||||||
GGUF_TYPE_BOOL = 7,
|
GGUF_TYPE_BOOL = 7,
|
||||||
GGUF_TYPE_STRING = 8,
|
GGUF_TYPE_STRING = 8,
|
||||||
GGUF_TYPE_ARRAY = 9,
|
GGUF_TYPE_ARRAY = 9,
|
||||||
|
GGUF_TYPE_UINT64 = 10,
|
||||||
|
GGUF_TYPE_INT64 = 11,
|
||||||
|
GGUF_TYPE_FLOAT64 = 12,
|
||||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1867,6 +1870,9 @@ extern "C" {
|
||||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||||
|
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
||||||
|
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
||||||
|
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
||||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||||
|
@ -1886,6 +1892,9 @@ extern "C" {
|
||||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||||
|
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||||
|
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||||
|
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||||
|
@ -1944,6 +1953,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_clblast (void);
|
GGML_API int ggml_cpu_has_clblast (void);
|
||||||
GGML_API int ggml_cpu_has_gpublas (void);
|
GGML_API int ggml_cpu_has_gpublas (void);
|
||||||
GGML_API int ggml_cpu_has_sse3 (void);
|
GGML_API int ggml_cpu_has_sse3 (void);
|
||||||
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
GGML_API int ggml_cpu_has_vsx (void);
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
|
@ -13,7 +13,7 @@ from typing import Any, IO, List, Optional
|
||||||
#
|
#
|
||||||
|
|
||||||
GGUF_MAGIC = 0x46554747
|
GGUF_MAGIC = 0x46554747
|
||||||
GGUF_VERSION = 1
|
GGUF_VERSION = 2
|
||||||
GGUF_DEFAULT_ALIGNMENT = 32
|
GGUF_DEFAULT_ALIGNMENT = 32
|
||||||
|
|
||||||
# general
|
# general
|
||||||
|
@ -365,6 +365,9 @@ class GGUFValueType(IntEnum):
|
||||||
BOOL = 7
|
BOOL = 7
|
||||||
STRING = 8
|
STRING = 8
|
||||||
ARRAY = 9
|
ARRAY = 9
|
||||||
|
UINT64 = 10
|
||||||
|
INT64 = 11
|
||||||
|
FLOAT64 = 12
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_type(val):
|
def get_type(val):
|
||||||
|
@ -378,6 +381,7 @@ class GGUFValueType(IntEnum):
|
||||||
return GGUFValueType.BOOL
|
return GGUFValueType.BOOL
|
||||||
elif isinstance(val, int):
|
elif isinstance(val, int):
|
||||||
return GGUFValueType.INT32
|
return GGUFValueType.INT32
|
||||||
|
# TODO: need help with 64-bit types in Python
|
||||||
else:
|
else:
|
||||||
print("Unknown type: "+str(type(val)))
|
print("Unknown type: "+str(type(val)))
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
@ -400,8 +404,8 @@ class GGUFWriter:
|
||||||
def write_header_to_file(self):
|
def write_header_to_file(self):
|
||||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||||
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
||||||
self.fout.write(struct.pack("<I", self.ti_data_count))
|
self.fout.write(struct.pack("<Q", self.ti_data_count))
|
||||||
self.fout.write(struct.pack("<I", self.kv_data_count))
|
self.fout.write(struct.pack("<Q", self.kv_data_count))
|
||||||
self.flush()
|
self.flush()
|
||||||
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
||||||
|
|
||||||
|
@ -444,6 +448,18 @@ class GGUFWriter:
|
||||||
self.add_key(key)
|
self.add_key(key)
|
||||||
self.add_val(val, GGUFValueType.FLOAT32)
|
self.add_val(val, GGUFValueType.FLOAT32)
|
||||||
|
|
||||||
|
def add_uint64(self, key: str, val: int):
    """Record `key` via add_key, then `val` via add_val tagged as GGUFValueType.UINT64."""
    vtype = GGUFValueType.UINT64
    self.add_key(key)
    self.add_val(val, vtype)
|
||||||
|
|
||||||
|
def add_int64(self, key: str, val: int):
    """Record `key` via add_key, then `val` via add_val tagged as GGUFValueType.INT64."""
    vtype = GGUFValueType.INT64
    self.add_key(key)
    self.add_val(val, vtype)
|
||||||
|
|
||||||
|
def add_float64(self, key: str, val: float):
    """Record `key` via add_key, then `val` via add_val tagged as GGUFValueType.FLOAT64."""
    vtype = GGUFValueType.FLOAT64
    self.add_key(key)
    self.add_val(val, vtype)
|
||||||
|
|
||||||
def add_bool(self, key: str, val: bool):
|
def add_bool(self, key: str, val: bool):
|
||||||
self.add_key(key)
|
self.add_key(key)
|
||||||
self.add_val(val, GGUFValueType.BOOL)
|
self.add_val(val, GGUFValueType.BOOL)
|
||||||
|
@ -483,17 +499,23 @@ class GGUFWriter:
|
||||||
self.kv_data += struct.pack("<i", val)
|
self.kv_data += struct.pack("<i", val)
|
||||||
elif vtype == GGUFValueType.FLOAT32:
|
elif vtype == GGUFValueType.FLOAT32:
|
||||||
self.kv_data += struct.pack("<f", val)
|
self.kv_data += struct.pack("<f", val)
|
||||||
|
elif vtype == GGUFValueType.UINT64:
|
||||||
|
self.kv_data += struct.pack("<Q", val)
|
||||||
|
elif vtype == GGUFValueType.INT64:
|
||||||
|
self.kv_data += struct.pack("<q", val)
|
||||||
|
elif vtype == GGUFValueType.FLOAT64:
|
||||||
|
self.kv_data += struct.pack("<d", val)
|
||||||
elif vtype == GGUFValueType.BOOL:
|
elif vtype == GGUFValueType.BOOL:
|
||||||
self.kv_data += struct.pack("?", val)
|
self.kv_data += struct.pack("?", val)
|
||||||
elif vtype == GGUFValueType.STRING:
|
elif vtype == GGUFValueType.STRING:
|
||||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||||
self.kv_data += struct.pack("<I", len(encoded_val))
|
self.kv_data += struct.pack("<Q", len(encoded_val))
|
||||||
self.kv_data += encoded_val
|
self.kv_data += encoded_val
|
||||||
elif vtype == GGUFValueType.ARRAY:
|
elif vtype == GGUFValueType.ARRAY:
|
||||||
ltype = set([GGUFValueType.get_type(item) for item in val])
|
ltype = set([GGUFValueType.get_type(item) for item in val])
|
||||||
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
|
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
|
||||||
self.kv_data += struct.pack("<I", list(ltype)[0])
|
self.kv_data += struct.pack("<I", list(ltype)[0])
|
||||||
self.kv_data += struct.pack("<I", len(val))
|
self.kv_data += struct.pack("<Q", len(val))
|
||||||
for item in val:
|
for item in val:
|
||||||
self.add_val(item, add_vtype=False)
|
self.add_val(item, add_vtype=False)
|
||||||
else:
|
else:
|
||||||
|
@ -507,12 +529,12 @@ class GGUFWriter:
|
||||||
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||||
|
|
||||||
encoded_name = name.encode("utf8")
|
encoded_name = name.encode("utf8")
|
||||||
self.ti_data += struct.pack("<I", len(encoded_name))
|
self.ti_data += struct.pack("<Q", len(encoded_name))
|
||||||
self.ti_data += encoded_name
|
self.ti_data += encoded_name
|
||||||
n_dims = len(tensor_shape)
|
n_dims = len(tensor_shape)
|
||||||
self.ti_data += struct.pack("<I", n_dims)
|
self.ti_data += struct.pack("<I", n_dims)
|
||||||
for i in range(n_dims):
|
for i in range(n_dims):
|
||||||
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
|
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
|
||||||
if raw_dtype is None:
|
if raw_dtype is None:
|
||||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||||
else:
|
else:
|
||||||
|
|
470
llama.cpp
470
llama.cpp
|
@ -1,9 +1,6 @@
|
||||||
// Defines fileno on msys:
|
// Defines fileno on msys:
|
||||||
#ifndef _GNU_SOURCE
|
#ifndef _GNU_SOURCE
|
||||||
#define _GNU_SOURCE
|
#define _GNU_SOURCE
|
||||||
#include <cstddef>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cstdio>
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
@ -62,6 +59,9 @@
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <climits>
|
#include <climits>
|
||||||
#include <cstdarg>
|
#include <cstdarg>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`, scanning left to right.
//
// The output is assembled in a separate buffer and moved into `s` at the end, so text introduced
// by `replace` is never re-scanned (e.g. replacing "a" with "aa" terminates and doubles each "a").
// An empty `search` leaves `s` unchanged.
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        // std::string::find("") matches at the current position without advancing,
        // so the loop below would never terminate and `result` would grow without bound
        return;
    }

    std::string result;
    result.reserve(s.size());

    size_t pos = 0;
    for (;;) {
        const size_t match = s.find(search, pos);
        if (match == std::string::npos) {
            // no further matches: copy the remaining tail verbatim
            result.append(s, pos, std::string::npos);
            break;
        }
        // copy the unmatched text before the hit, then the replacement
        result.append(s, pos, match - pos);
        result += replace;
        // resume scanning right after the matched text
        pos = match + search.length();
    }

    s = std::move(result);
}
|
||||||
|
|
||||||
static void zeros(std::ofstream & file, size_t n) {
|
static void zeros(std::ofstream & file, size_t n) {
|
||||||
|
@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
|
||||||
(void) tensor;
|
(void) tensor;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
|
static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||||
std::vector<char> result(8, 0);
|
std::vector<char> result(8, 0);
|
||||||
const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
|
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
result.resize(-n_tokens);
|
result.resize(-n_tokens);
|
||||||
int check = llama_token_to_str(ctx, token, result.data(), result.size());
|
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||||
GGML_ASSERT(check == -n_tokens);
|
GGML_ASSERT(check == -n_tokens);
|
||||||
} else {
|
} else {
|
||||||
result.resize(n_tokens);
|
result.resize(n_tokens);
|
||||||
|
@ -955,10 +960,10 @@ struct llama_vocab {
|
||||||
id linefeed_id = 13;
|
id linefeed_id = 13;
|
||||||
|
|
||||||
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
||||||
replace_all(token_left, " ", "Ġ");
|
replace_all(token_left, " ", "\u0120");
|
||||||
replace_all(token_left, "\n", "Ċ");
|
replace_all(token_left, "\n", "\u010A");
|
||||||
replace_all(token_right, " ", "Ġ");
|
replace_all(token_right, " ", "\u0120");
|
||||||
replace_all(token_right, "\n", "Ċ");
|
replace_all(token_right, "\n", "\u010A");
|
||||||
|
|
||||||
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
||||||
if (it == bpe_ranks.end()) {
|
if (it == bpe_ranks.end()) {
|
||||||
|
@ -1144,11 +1149,13 @@ static bool llama_kv_cache_init(
|
||||||
|
|
||||||
// GGUF file format versions; each enumerator's value equals the version number in its name
enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
};
|
||||||
|
|
||||||
static const char * llama_file_version_name(llama_fver version) {
|
static const char * llama_file_version_name(llama_fver version) {
|
||||||
switch (version) {
|
switch (version) {
|
||||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
|
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
||||||
|
case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
|
||||||
}
|
}
|
||||||
|
|
||||||
return "unknown";
|
return "unknown";
|
||||||
|
@ -1635,7 +1642,8 @@ static void llm_load_hparams(
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: This should probably be in llama.h
|
// TODO: This should probably be in llama.h
|
||||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
|
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
|
||||||
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
||||||
|
|
||||||
static void llm_load_vocab(
|
static void llm_load_vocab(
|
||||||
llama_model_loader & ml,
|
llama_model_loader & ml,
|
||||||
|
@ -1737,7 +1745,11 @@ static void llm_load_vocab(
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||||
|
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||||
|
} else {
|
||||||
|
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
|
||||||
|
}
|
||||||
|
|
||||||
// special tokens
|
// special tokens
|
||||||
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
||||||
|
@ -2635,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon(
|
||||||
|
|
||||||
const size_t wsize = ggml_type_size(cur->type);
|
const size_t wsize = ggml_type_size(cur->type);
|
||||||
|
|
||||||
struct ggml_tensor * tmpq = ggml_view_3d(
|
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
||||||
|
// non-contiguous views is added for the rope operator
|
||||||
|
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
||||||
ctx0, cur, n_embd_head, n_head, N,
|
ctx0, cur, n_embd_head, n_head, N,
|
||||||
wsize * n_embd_head,
|
wsize * n_embd_head,
|
||||||
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
||||||
0);
|
0));
|
||||||
offload_func_kq(tmpq);
|
offload_func_kq(tmpq);
|
||||||
|
|
||||||
struct ggml_tensor * tmpk = ggml_view_3d(
|
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
||||||
ctx0, cur, n_embd_head, n_head_kv, N,
|
ctx0, cur, n_embd_head, n_head_kv, N,
|
||||||
wsize * n_embd_head,
|
wsize * n_embd_head,
|
||||||
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
||||||
wsize * n_embd_head * n_head);
|
wsize * n_embd_head * n_head));
|
||||||
offload_func_kq(tmpk);
|
offload_func_kq(tmpk);
|
||||||
|
|
||||||
struct ggml_tensor * tmpv = ggml_view_3d(
|
struct ggml_tensor * tmpv = ggml_view_3d(
|
||||||
|
@ -2831,7 +2845,6 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
GGML_ASSERT(n_tokens > 0);
|
GGML_ASSERT(n_tokens > 0);
|
||||||
GGML_ASSERT(n_past >= 0);
|
GGML_ASSERT(n_past >= 0);
|
||||||
GGML_ASSERT(n_threads > 0);
|
|
||||||
// TODO: keep the values of n_batch and n_ctx
|
// TODO: keep the values of n_batch and n_ctx
|
||||||
// GGML_ASSERT(n_tokens <= n_batch);
|
// GGML_ASSERT(n_tokens <= n_batch);
|
||||||
// GGML_ASSERT(n_past + n_tokens <= n_ctx);
|
// GGML_ASSERT(n_past + n_tokens <= n_ctx);
|
||||||
|
@ -2842,6 +2855,8 @@ static bool llama_eval_internal(
|
||||||
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
GGML_ASSERT(n_threads > 0);
|
||||||
|
|
||||||
const int N = n_tokens;
|
const int N = n_tokens;
|
||||||
|
|
||||||
const auto & model = lctx.model;
|
const auto & model = lctx.model;
|
||||||
|
@ -3026,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||||
return vocab.token_to_id.at(buf);
|
return vocab.token_to_id.at(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string llama_escape_whitespace(const std::string& text) {
|
static void llama_escape_whitespace(std::string & text) {
|
||||||
std::string result = "\xe2\x96\x81";
|
replace_all(text, " ", "\xe2\x96\x81");
|
||||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
|
||||||
if (text[offs] == ' ') {
|
|
||||||
result += "\xe2\x96\x81";
|
|
||||||
} else {
|
|
||||||
result += text[offs];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_unescape_whitespace(std::string & word) {
|
static void llama_unescape_whitespace(std::string & word) {
|
||||||
|
@ -3219,7 +3226,7 @@ struct llm_bigram_bpe {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llm_tokenizer_bpe {
|
struct llm_tokenizer_bpe {
|
||||||
llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
|
llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
|
||||||
|
|
||||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||||
int final_prev_index = -1;
|
int final_prev_index = -1;
|
||||||
|
@ -3371,8 +3378,6 @@ private:
|
||||||
return words;
|
return words;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool flag_g2ws = false;
|
|
||||||
|
|
||||||
const llama_vocab & vocab;
|
const llama_vocab & vocab;
|
||||||
|
|
||||||
std::vector<llm_symbol> symbols;
|
std::vector<llm_symbol> symbols;
|
||||||
|
@ -3381,9 +3386,18 @@ private:
|
||||||
llm_bigram_bpe::queue work_queue;
|
llm_bigram_bpe::queue work_queue;
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
|
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
|
||||||
std::vector<llama_vocab::id> output;
|
std::vector<llama_vocab::id> output;
|
||||||
|
|
||||||
|
// OG tokenizer behavior:
|
||||||
|
//
|
||||||
|
// tokenizer.encode('', add_bos=True) returns [1]
|
||||||
|
// tokenizer.encode('', add_bos=False) returns []
|
||||||
|
|
||||||
|
if (bos && vocab.special_bos_id != -1) {
|
||||||
|
output.push_back(vocab.special_bos_id);
|
||||||
|
}
|
||||||
|
|
||||||
if (raw_text.empty()) {
|
if (raw_text.empty()) {
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
@ -3391,29 +3405,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
switch (vocab.type) {
|
switch (vocab.type) {
|
||||||
case LLAMA_VOCAB_TYPE_SPM:
|
case LLAMA_VOCAB_TYPE_SPM:
|
||||||
{
|
{
|
||||||
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||||
|
raw_text = " " + raw_text;
|
||||||
|
|
||||||
llm_tokenizer_spm tokenizer(vocab);
|
llm_tokenizer_spm tokenizer(vocab);
|
||||||
|
llama_escape_whitespace(raw_text);
|
||||||
if (bos) {
|
tokenizer.tokenize(raw_text, output);
|
||||||
output.push_back(vocab.special_bos_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string text;
|
|
||||||
if (escape) {
|
|
||||||
text = llama_escape_whitespace(raw_text);
|
|
||||||
} else {
|
|
||||||
text = raw_text;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenizer.tokenize(text, output);
|
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_VOCAB_TYPE_BPE:
|
case LLAMA_VOCAB_TYPE_BPE:
|
||||||
{
|
{
|
||||||
llm_tokenizer_bpe tokenizer(vocab, escape);
|
llm_tokenizer_bpe tokenizer(vocab);
|
||||||
|
|
||||||
if (bos && vocab.special_bos_id != -1) {
|
|
||||||
output.push_back(vocab.special_bos_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenizer.tokenize(raw_text, output);
|
tokenizer.tokenize(raw_text, output);
|
||||||
} break;
|
} break;
|
||||||
};
|
};
|
||||||
|
@ -3908,7 +3909,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
|
||||||
|
|
||||||
// Calculate absolute value of second derivatives
|
// Calculate absolute value of second derivatives
|
||||||
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
||||||
second_derivatives[i] = abs(second_derivatives[i]);
|
second_derivatives[i] = std::abs(second_derivatives[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Normalize the second derivatives
|
// Normalize the second derivatives
|
||||||
|
@ -4099,16 +4100,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
||||||
std::vector<llama_grammar_candidate> candidates_grammar;
|
std::vector<llama_grammar_candidate> candidates_grammar;
|
||||||
|
|
||||||
for (size_t i = 0; i < candidates->size; ++i) {
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
const llama_token id = candidates->data[i].id;
|
const llama_token id = candidates->data[i].id;
|
||||||
const std::string text = llama_token_to_text(ctx, id);
|
const std::string piece = llama_token_to_str(ctx, id);
|
||||||
if (id == eos) {
|
if (id == eos) {
|
||||||
if (!allow_eos) {
|
if (!allow_eos) {
|
||||||
candidates->data[i].logit = -INFINITY;
|
candidates->data[i].logit = -INFINITY;
|
||||||
}
|
}
|
||||||
} else if (text.empty() || text[0] == 0) {
|
} else if (piece.empty() || piece[0] == 0) {
|
||||||
candidates->data[i].logit = -INFINITY;
|
candidates->data[i].logit = -INFINITY;
|
||||||
} else {
|
} else {
|
||||||
candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
|
candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
|
||||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4312,10 +4313,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string text = llama_token_to_text(ctx, token);
|
const std::string piece = llama_token_to_str(ctx, token);
|
||||||
|
|
||||||
// Note terminating 0 in decoded string
|
// Note terminating 0 in decoded string
|
||||||
const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8);
|
const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
|
||||||
const auto & code_points = decoded.first;
|
const auto & code_points = decoded.first;
|
||||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||||
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
||||||
|
@ -4326,6 +4327,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Beam search
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_beam {
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||||
|
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
||||||
|
// Sort beams by probability. In case of ties, prefer beams at eob.
|
||||||
|
bool operator<(const llama_beam & rhs) const {
|
||||||
|
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
||||||
|
}
|
||||||
|
// Shift off first n tokens and discard them.
|
||||||
|
void shift_tokens(const size_t n) {
|
||||||
|
if (n) {
|
||||||
|
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
||||||
|
tokens.resize(tokens.size() - n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
||||||
|
};
|
||||||
|
|
||||||
|
// A struct for calculating logit-related info.
|
||||||
|
struct llama_logit_info {
|
||||||
|
const float * const logits;
|
||||||
|
const int n_vocab;
|
||||||
|
const float max_l;
|
||||||
|
const float normalizer;
|
||||||
|
struct sum_exp {
|
||||||
|
float max_l;
|
||||||
|
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
||||||
|
};
|
||||||
|
llama_logit_info(llama_context * ctx)
|
||||||
|
: logits(llama_get_logits(ctx))
|
||||||
|
, n_vocab(llama_n_vocab(ctx))
|
||||||
|
, max_l(*std::max_element(logits, logits + n_vocab))
|
||||||
|
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
||||||
|
{ }
|
||||||
|
llama_token_data get_token_data(const llama_token token_id) const {
|
||||||
|
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
||||||
|
return {token_id, logits[token_id], p};
|
||||||
|
}
|
||||||
|
// Return top k token_data by logit.
|
||||||
|
std::vector<llama_token_data> top_k(size_t k) {
|
||||||
|
std::vector<llama_token_data> min_heap; // min-heap by logit
|
||||||
|
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
||||||
|
min_heap.reserve(k_min);
|
||||||
|
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
||||||
|
min_heap.push_back(get_token_data(token_id));
|
||||||
|
}
|
||||||
|
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
||||||
|
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
||||||
|
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
||||||
|
if (min_heap.front().logit < logits[token_id]) {
|
||||||
|
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
||||||
|
min_heap.back().id = token_id;
|
||||||
|
min_heap.back().logit = logits[token_id];
|
||||||
|
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return min_heap;
|
||||||
|
}
|
||||||
|
float probability_from_logit(float logit) {
|
||||||
|
return normalizer * std::exp(logit - max_l);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_beam_search_data {
|
||||||
|
llama_context * ctx;
|
||||||
|
size_t n_beams;
|
||||||
|
int n_past;
|
||||||
|
int n_predict;
|
||||||
|
int n_threads;
|
||||||
|
std::vector<llama_beam> beams;
|
||||||
|
std::vector<llama_beam> next_beams;
|
||||||
|
|
||||||
|
// Re-calculated on each loop iteration
|
||||||
|
size_t common_prefix_length;
|
||||||
|
|
||||||
|
// Used to communicate to/from callback on beams state.
|
||||||
|
std::vector<llama_beam_view> beam_views;
|
||||||
|
|
||||||
|
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
|
||||||
|
: ctx(ctx)
|
||||||
|
, n_beams(n_beams)
|
||||||
|
, n_past(n_past)
|
||||||
|
, n_predict(n_predict)
|
||||||
|
, n_threads(n_threads)
|
||||||
|
, beam_views(n_beams) {
|
||||||
|
beams.reserve(n_beams);
|
||||||
|
next_beams.reserve(n_beams);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collapse beams to a single beam given by index.
|
||||||
|
void collapse_beams(const size_t beam_idx) {
|
||||||
|
if (0u < beam_idx) {
|
||||||
|
std::swap(beams[0], beams[beam_idx]);
|
||||||
|
}
|
||||||
|
beams.resize(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
||||||
|
// The repetative patterns below reflect the 2 stages of heaps:
|
||||||
|
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
||||||
|
// * If the heap is full and a new element is found that should be included, pop the
|
||||||
|
// least element to the back(), replace it with the new, then push it into the heap.
|
||||||
|
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
||||||
|
// Min-heaps use a greater-than comparator.
|
||||||
|
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
||||||
|
if (beam.eob) {
|
||||||
|
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
||||||
|
if (next_beams.size() < n_beams) {
|
||||||
|
next_beams.push_back(std::move(beam));
|
||||||
|
if (next_beams.size() == n_beams) {
|
||||||
|
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
} else if (next_beams.front().p < beam.p) {
|
||||||
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
next_beams.back() = std::move(beam);
|
||||||
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
||||||
|
if (!beam.tokens.empty()) {
|
||||||
|
llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
|
||||||
|
}
|
||||||
|
llama_logit_info logit_info(ctx);
|
||||||
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
||||||
|
size_t i=0;
|
||||||
|
if (next_beams.size() < n_beams) {
|
||||||
|
for (; next_beams.size() < n_beams ; ++i) {
|
||||||
|
llama_beam next_beam = beam;
|
||||||
|
next_beam.tokens.push_back(next_tokens[i].id);
|
||||||
|
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
||||||
|
next_beams.push_back(std::move(next_beam));
|
||||||
|
}
|
||||||
|
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
} else {
|
||||||
|
for (; next_beams.front().p == 0.0f ; ++i) {
|
||||||
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
next_beams.back() = beam;
|
||||||
|
next_beams.back().tokens.push_back(next_tokens[i].id);
|
||||||
|
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
||||||
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (; i < n_beams ; ++i) {
|
||||||
|
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
||||||
|
if (next_beams.front().p < next_p) {
|
||||||
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
next_beams.back() = beam;
|
||||||
|
next_beams.back().tokens.push_back(next_tokens[i].id);
|
||||||
|
next_beams.back().p = next_p;
|
||||||
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find common_prefix_length based on beams.
|
||||||
|
// Requires beams is not empty.
|
||||||
|
size_t find_common_prefix_length() {
|
||||||
|
size_t common_prefix_length = beams[0].tokens.size();
|
||||||
|
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
||||||
|
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
||||||
|
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
||||||
|
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
||||||
|
common_prefix_length = j;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return common_prefix_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Construct beams_state to send back to caller via the callback function.
|
||||||
|
// Side effect: set common_prefix_length = find_common_prefix_length();
|
||||||
|
llama_beams_state get_beams_state(const bool last_call) {
|
||||||
|
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
||||||
|
beam_views[i] = beams[i].view();
|
||||||
|
}
|
||||||
|
common_prefix_length = find_common_prefix_length();
|
||||||
|
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loop:
|
||||||
|
// * while i < n_predict, AND
|
||||||
|
// * any of the beams have not yet reached end-of-beam (eob), AND
|
||||||
|
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
||||||
|
// (since all other beam probabilities can only decrease)
|
||||||
|
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
||||||
|
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
||||||
|
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
||||||
|
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
||||||
|
!beams[top_beam_index()].eob ; ++i) {
|
||||||
|
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
||||||
|
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
||||||
|
if (common_prefix_length) {
|
||||||
|
llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
|
||||||
|
n_past += common_prefix_length;
|
||||||
|
}
|
||||||
|
// Zero-out next_beam probabilities to place them last in following min-heap.
|
||||||
|
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
||||||
|
for (llama_beam & beam : beams) {
|
||||||
|
beam.shift_tokens(common_prefix_length);
|
||||||
|
fill_next_beams_by_top_probabilities(beam);
|
||||||
|
}
|
||||||
|
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
||||||
|
beams.swap(next_beams);
|
||||||
|
renormalize_beam_probabilities(beams);
|
||||||
|
}
|
||||||
|
collapse_beams(top_beam_index());
|
||||||
|
callback(callback_data, get_beams_state(true));
|
||||||
|
}
|
||||||
|
|
||||||
|
// As beams grow, the cumulative probabilities decrease.
|
||||||
|
// Renormalize them to avoid floating point underflow.
|
||||||
|
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
||||||
|
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
||||||
|
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
||||||
|
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
||||||
|
size_t top_beam_index() {
|
||||||
|
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy (p,eob) for each beam which may have been changed by the callback.
|
||||||
|
void update_beams_from_beam_views() {
|
||||||
|
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
||||||
|
beams[i].p = beam_views[i].p;
|
||||||
|
beams[i].eob = beam_views[i].eob;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Public entry point for beam search: builds the search state, runs its loop
// with the supplied callback, then adds the elapsed time to the context's
// sampling timer and bumps its sample counter.
void llama_beam_search(llama_context * ctx,
                       llama_beam_search_callback_fn_t callback, void * callback_data,
                       size_t n_beams, int n_past, int n_predict, int n_threads) {
    assert(ctx);
    const int64_t t_begin_us = ggml_time_us();

    llama_beam_search_data search(ctx, n_beams, n_past, n_predict, n_threads);
    search.loop(callback, callback_data);

    const int64_t t_elapsed_us = ggml_time_us() - t_begin_us;
    ctx->t_sample_us += t_elapsed_us;
    ctx->n_sample += 1;
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
@ -4423,6 +4675,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
|
|
||||||
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
|
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
|
||||||
|
|
||||||
|
llama_model model;
|
||||||
|
llm_load_arch(*ml, model);
|
||||||
|
llm_load_hparams(*ml, model, 0, 0, 0);
|
||||||
|
|
||||||
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
||||||
struct gguf_context * ctx_out = gguf_init_empty();
|
struct gguf_context * ctx_out = gguf_init_empty();
|
||||||
|
|
||||||
|
@ -4448,6 +4704,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
++n_feed_forward_w2;
|
++n_feed_forward_w2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
|
||||||
|
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
|
||||||
|
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
|
||||||
|
}
|
||||||
|
|
||||||
int i_attention_wv = 0;
|
int i_attention_wv = 0;
|
||||||
int i_feed_forward_w2 = 0;
|
int i_feed_forward_w2 = 0;
|
||||||
|
@ -4524,8 +4784,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
|
|
||||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||||
int nx = tensor->ne[0];
|
int nx = tensor->ne[0];
|
||||||
int ny = tensor->ne[1];
|
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||||
if (nx % QK_K == 0 && ny % QK_K == 0) {
|
new_type = GGML_TYPE_Q8_0;
|
||||||
|
}
|
||||||
|
else if (new_type != GGML_TYPE_Q8_0) {
|
||||||
new_type = GGML_TYPE_Q6_K;
|
new_type = GGML_TYPE_Q6_K;
|
||||||
}
|
}
|
||||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||||
|
@ -4539,21 +4801,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||||
|
if (model.type == MODEL_70B) {
|
||||||
|
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||||
|
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||||
|
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||||
|
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
||||||
|
}
|
||||||
++i_attention_wv;
|
++i_attention_wv;
|
||||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||||
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
||||||
|
: model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
|
||||||
|
: GGML_TYPE_Q3_K;
|
||||||
|
}
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
||||||
|
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
||||||
|
}
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||||
|
if (model.arch == LLM_ARCH_FALCON) {
|
||||||
|
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
|
||||||
|
use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||||
|
} else {
|
||||||
|
if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
|
||||||
|
new_type = GGML_TYPE_Q5_K;
|
||||||
}
|
}
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
||||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
|
||||||
use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
|
|
||||||
++i_feed_forward_w2;
|
++i_feed_forward_w2;
|
||||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
if (model.arch != LLM_ARCH_FALCON) {
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||||
|
} else {
|
||||||
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
||||||
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
||||||
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
||||||
}
|
}
|
||||||
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
|
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||||
|
@ -4568,8 +4858,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
||||||
int nx = tensor->ne[0];
|
int nx = tensor->ne[0];
|
||||||
int ny = tensor->ne[1];
|
int ny = tensor->ne[1];
|
||||||
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
if (nx % QK_K != 0) {
|
||||||
LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
|
||||||
convert_incompatible_tensor = true;
|
convert_incompatible_tensor = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5297,13 +5587,29 @@ int llama_model_n_embd(const struct llama_model * model) {
|
||||||
return model->hparams.n_embd;
|
return model->hparams.n_embd;
|
||||||
}
|
}
|
||||||
|
|
||||||
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
|
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||||
return snprintf(buf, buf_size, "%s %s %s",
|
return snprintf(buf, buf_size, "%s %s %s",
|
||||||
model->name.c_str(),
|
model->name.c_str(),
|
||||||
llama_model_type_name(model->type),
|
llama_model_type_name(model->type),
|
||||||
llama_model_ftype_name(model->ftype).c_str());
|
llama_model_ftype_name(model->ftype).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t llama_model_size(const struct llama_model * model) {
|
||||||
|
uint64_t size = 0;
|
||||||
|
for (const auto & it : model->tensors_by_name) {
|
||||||
|
size += ggml_nbytes(it.second);
|
||||||
|
}
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t llama_model_n_params(const struct llama_model * model) {
|
||||||
|
uint64_t nparams = 0;
|
||||||
|
for (const auto & it : model->tensors_by_name) {
|
||||||
|
nparams += ggml_nelements(it.second);
|
||||||
|
}
|
||||||
|
return nparams;
|
||||||
|
}
|
||||||
|
|
||||||
int llama_model_quantize(
|
int llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
const char * fname_out,
|
const char * fname_out,
|
||||||
|
@ -5828,8 +6134,7 @@ int llama_tokenize_with_model(
|
||||||
llama_token * tokens,
|
llama_token * tokens,
|
||||||
int n_max_tokens,
|
int n_max_tokens,
|
||||||
bool add_bos) {
|
bool add_bos) {
|
||||||
auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
|
auto res = llama_tokenize_internal(model->vocab, text, add_bos);
|
||||||
auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
|
|
||||||
|
|
||||||
if (n_max_tokens < (int) res.size()) {
|
if (n_max_tokens < (int) res.size()) {
|
||||||
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
||||||
|
@ -5843,12 +6148,12 @@ int llama_tokenize_with_model(
|
||||||
return res.size();
|
return res.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
|
int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
|
||||||
return llama_token_to_str_with_model(&ctx->model, token, buf, length);
|
return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// does not write null-terminator to str
|
// does not write null-terminator to buf
|
||||||
int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
|
int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
|
||||||
if (0 <= token && token < llama_model_n_vocab(model)) {
|
if (0 <= token && token < llama_model_n_vocab(model)) {
|
||||||
if (llama_is_normal_token(model->vocab, token)) {
|
if (llama_is_normal_token(model->vocab, token)) {
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
|
@ -5936,6 +6241,7 @@ const char * llama_print_system_info(void) {
|
||||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||||
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
|
|
53
llama.h
53
llama.h
|
@ -254,7 +254,11 @@ extern "C" {
|
||||||
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
||||||
|
|
||||||
// Get a string describing the model type
|
// Get a string describing the model type
|
||||||
LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
|
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||||
|
// Returns the total size of all the tensors in the model in bytes
|
||||||
|
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||||
|
// Returns the total number of parameters in the model
|
||||||
|
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns 0 on success
|
// Returns 0 on success
|
||||||
LLAMA_API int llama_model_quantize(
|
LLAMA_API int llama_model_quantize(
|
||||||
|
@ -377,15 +381,17 @@ extern "C" {
|
||||||
int n_max_tokens,
|
int n_max_tokens,
|
||||||
bool add_bos);
|
bool add_bos);
|
||||||
|
|
||||||
// Token Id -> String. Uses the vocabulary in the provided context
|
// Token Id -> Piece.
|
||||||
// Does not write null terminator to the buffer
|
// Uses the vocabulary in the provided context.
|
||||||
LLAMA_API int llama_token_to_str(
|
// Does not write null terminator to the buffer.
|
||||||
|
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
||||||
|
LLAMA_API int llama_token_to_piece(
|
||||||
const struct llama_context * ctx,
|
const struct llama_context * ctx,
|
||||||
llama_token token,
|
llama_token token,
|
||||||
char * buf,
|
char * buf,
|
||||||
int length);
|
int length);
|
||||||
|
|
||||||
LLAMA_API int llama_token_to_str_with_model(
|
LLAMA_API int llama_token_to_piece_with_model(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
llama_token token,
|
llama_token token,
|
||||||
char * buf,
|
char * buf,
|
||||||
|
@ -465,6 +471,43 @@ extern "C" {
|
||||||
/// @details Accepts the sampled token into the grammar
|
/// @details Accepts the sampled token into the grammar
|
||||||
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Beam search
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_beam_view {
|
||||||
|
const llama_token * tokens;
|
||||||
|
size_t n_tokens;
|
||||||
|
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||||
|
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
||||||
|
};
|
||||||
|
|
||||||
|
// Snapshot of all beams, passed by value to the beam_search_callback function.
//
// When common_prefix_length is greater than 0, the callback should copy that many
// leading tokens from any one beam (e.g. beam_views[0]): they are removed (shifted)
// from every beam in all subsequent callbacks.
//
// The pointers in this struct are valid only during the synchronous callback and
// should not be saved.
struct llama_beams_state {
    // Array of beam views.
    struct llama_beam_view * beam_views;
    // Number of elements in beam_views[].
    size_t n_beams;
    // Current max length of prefix tokens shared by all beams.
    size_t common_prefix_length;
    // True iff this is the last callback invocation.
    bool last_call;
};
|
||||||
|
|
||||||
|
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
// The state parameter is spelled with the `struct` keyword so that this declaration is
// also valid when the header is compiled as C, where a struct tag is not a type name by itself.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
|
||||||
|
|
||||||
|
/// @details Deterministically returns entire sentence constructed by a beam search.
|
||||||
|
/// @param ctx Pointer to the llama_context.
|
||||||
|
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
|
||||||
|
/// @param callback_data A pointer that is simply passed back to callback.
|
||||||
|
/// @param n_beams Number of beams to use.
|
||||||
|
/// @param n_past Number of tokens already evaluated.
|
||||||
|
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
||||||
|
/// @param n_threads Number of threads as passed to llama_eval().
|
||||||
|
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
|
||||||
|
|
||||||
// Performance information
|
// Performance information
|
||||||
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
||||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||||
|
|
26
scripts/convert-gg.sh
Executable file
26
scripts/convert-gg.sh
Executable file
|
@ -0,0 +1,26 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# LLaMA v1
|
||||||
|
python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
|
||||||
|
|
||||||
|
# LLaMA v2
|
||||||
|
python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
|
||||||
|
|
||||||
|
# Code Llama
|
||||||
|
python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
|
||||||
|
python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
|
||||||
|
|
||||||
|
# Falcon
|
||||||
|
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1
|
||||||
|
mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1
|
||||||
|
mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf
|
|
@ -20,6 +20,8 @@ fi
|
||||||
model="$1"
|
model="$1"
|
||||||
out="../tmp/results-${model}"
|
out="../tmp/results-${model}"
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
mkdir -p ${out}
|
mkdir -p ${out}
|
||||||
|
|
||||||
for q in ${qnt[@]}; do
|
for q in ${qnt[@]}; do
|
||||||
|
|
|
@ -20,6 +20,8 @@ fi
|
||||||
model="$1"
|
model="$1"
|
||||||
out="../tmp/results-${model}"
|
out="../tmp/results-${model}"
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
mkdir -p ${out}
|
mkdir -p ${out}
|
||||||
|
|
||||||
mstr=""
|
mstr=""
|
||||||
|
|
|
@ -17,6 +17,8 @@ if [ ! -z "$3" ]; then
|
||||||
args="$3"
|
args="$3"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
model="$1"
|
model="$1"
|
||||||
out="../tmp/results-${model}"
|
out="../tmp/results-${model}"
|
||||||
|
|
||||||
|
|
|
@ -25,8 +25,10 @@ endfunction()
|
||||||
llama_build_and_test_executable(test-quantize-fns.cpp)
|
llama_build_and_test_executable(test-quantize-fns.cpp)
|
||||||
llama_build_and_test_executable(test-quantize-perf.cpp)
|
llama_build_and_test_executable(test-quantize-perf.cpp)
|
||||||
llama_build_and_test_executable(test-sampling.cpp)
|
llama_build_and_test_executable(test-sampling.cpp)
|
||||||
llama_build_executable(test-tokenizer-0.cpp)
|
llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
|
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||||
|
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
llama_build_executable(test-tokenizer-1.cpp)
|
llama_build_executable(test-tokenizer-1.cpp)
|
||||||
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
||||||
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
|
|
178
tests/test-tokenizer-0-falcon.cpp
Normal file
178
tests/test-tokenizer-0-falcon.cpp
Normal file
|
@ -0,0 +1,178 @@
|
||||||
|
#include "llama.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
// generate using test-tokenizer-0-falcon.py
|
||||||
|
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||||
|
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||||
|
{ "" , { }, },
|
||||||
|
{ " " , { 204, }, },
|
||||||
|
{ " " , { 258, }, },
|
||||||
|
{ " " , { 466, }, },
|
||||||
|
{ "\t" , { 192, }, },
|
||||||
|
{ "\n" , { 193, }, },
|
||||||
|
{ "\t\n" , { 19125, }, },
|
||||||
|
{ "Hello world" , { 9856, 1079, }, },
|
||||||
|
{ " Hello world" , { 23090, 1079, }, },
|
||||||
|
{ "Hello World" , { 9856, 2889, }, },
|
||||||
|
{ " Hello World" , { 23090, 2889, }, },
|
||||||
|
{ " Hello World!" , { 23090, 2889, 12, }, },
|
||||||
|
{ "Hello, world!" , { 9856, 23, 1079, 12, }, },
|
||||||
|
{ " Hello, world!" , { 23090, 23, 1079, 12, }, },
|
||||||
|
{ " this is 🦙.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, },
|
||||||
|
{ "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, },
|
||||||
|
{ "нещо на Български" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, },
|
||||||
|
{ "កាន់តែពិសេសអាចខលចេញ" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, },
|
||||||
|
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, },
|
||||||
|
{ "Hello" , { 9856, }, },
|
||||||
|
{ " Hello" , { 23090, }, },
|
||||||
|
{ " Hello" , { 204, 23090, }, },
|
||||||
|
{ " Hello" , { 258, 23090, }, },
|
||||||
|
{ " Hello" , { 466, 23090, }, },
|
||||||
|
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
||||||
|
};
|
||||||
|
|
||||||
|
return _k_tests;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 2) {
|
||||||
|
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string fname = argv[1];
|
||||||
|
|
||||||
|
std::string fname_text;
|
||||||
|
if (argc > 2) {
|
||||||
|
fname_text = argv[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
|
||||||
|
llama_backend_init(false);
|
||||||
|
|
||||||
|
// load the vocab
|
||||||
|
{
|
||||||
|
auto lparams = llama_context_default_params();
|
||||||
|
|
||||||
|
lparams.vocab_only = true;
|
||||||
|
|
||||||
|
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx = llama_new_context_with_model(model, lparams);
|
||||||
|
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
|
||||||
|
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_free(ctx);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = true;
|
||||||
|
|
||||||
|
for (const auto & test_kv : k_tests()) {
|
||||||
|
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
printf("src: '%s'\n", test_kv.first.c_str());
|
||||||
|
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
|
||||||
|
printf("tok: ");
|
||||||
|
for (const auto & tok : res) {
|
||||||
|
printf("%d ", tok);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
bool correct = res.size() == test_kv.second.size();
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) res.size() && correct; ++i) {
|
||||||
|
if (test_kv.second[i] != res[i]) {
|
||||||
|
correct = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!correct) {
|
||||||
|
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||||
|
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||||
|
llama_detokenize_bpe(ctx, res).c_str(),
|
||||||
|
llama_detokenize_bpe(ctx, test_kv.second).c_str());
|
||||||
|
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||||
|
for (const auto & t : test_kv.second) {
|
||||||
|
fprintf(stderr, "%6d, ", t);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||||
|
for (const auto & t : res) {
|
||||||
|
fprintf(stderr, "%6d, ", t);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!fname_text.empty()) {
|
||||||
|
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
|
||||||
|
|
||||||
|
std::string text;
|
||||||
|
{
|
||||||
|
std::ifstream ifs(fname_text);
|
||||||
|
if (!ifs) {
|
||||||
|
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||||
|
|
||||||
|
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||||
|
|
||||||
|
{
|
||||||
|
const std::string fname_out = fname_text + ".tokcpp";
|
||||||
|
|
||||||
|
std::ofstream ofs(fname_out);
|
||||||
|
if (!ofs) {
|
||||||
|
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & tok : res) {
|
||||||
|
ofs << tok << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
ofs << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_free(ctx);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return success ? 0 : 3;
|
||||||
|
}
|
83
tests/test-tokenizer-0-falcon.py
Normal file
83
tests/test-tokenizer-0-falcon.py
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
# tests with BPE tokenizer
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
# Command line: a tokenizer directory (handed to AutoTokenizer.from_pretrained)
# and, optionally, a text file whose content should be tokenized as well.
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer

tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

# Test strings. The whitespace is significant: the whitespace-only entries are
# 1, 2 and 3 spaces long and the indented "Hello" entries carry 1 to 4 leading
# spaces, so every entry is distinct.
tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
]

# Human readable dump: the text, its token ids and the round-tripped text.
for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))

# The same data formatted as C++ initializer entries
# ({ "text", { id, id, ... }, },) that can be pasted into the C++ test table.
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)

    # escape the control characters so the key is a valid C++ string literal
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")

print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))

# Optionally tokenize a whole file and store the ids in "<file>.tok".
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'

    # explicit encoding: the default depends on the platform locale
    with open(fname_tok, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s)

    # write to file: one line of space separated token ids
    with open(fname_out, 'w', encoding='utf-8') as f_out:
        for x in res:
            f_out.write(str(x) + ' ')
        f_out.write('\n')

    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
|
182
tests/test-tokenizer-0-llama.cpp
Normal file
182
tests/test-tokenizer-0-llama.cpp
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
#include "llama.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
// generate using test-tokenizer-0-llama.py
|
||||||
|
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||||
|
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||||
|
{ "" , { }, },
|
||||||
|
{ " " , { 259, }, },
|
||||||
|
{ " " , { 1678, }, },
|
||||||
|
{ " " , { 268, }, },
|
||||||
|
{ "\t" , { 29871, 12, }, },
|
||||||
|
{ "\n" , { 29871, 13, }, },
|
||||||
|
{ "\t\n" , { 29871, 12, 13, }, },
|
||||||
|
{ "Hello world" , { 15043, 3186, }, },
|
||||||
|
{ " Hello world" , { 29871, 15043, 3186, }, },
|
||||||
|
{ "Hello World" , { 15043, 2787, }, },
|
||||||
|
{ " Hello World" , { 29871, 15043, 2787, }, },
|
||||||
|
{ " Hello World!" , { 29871, 15043, 2787, 29991, }, },
|
||||||
|
{ "Hello, world!" , { 15043, 29892, 3186, 29991, }, },
|
||||||
|
{ " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, },
|
||||||
|
{ " this is 🦙.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||||
|
{ "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||||
|
{ "нещо на Български" , { 1538, 4851, 665, 1386, 29713, 1305, }, },
|
||||||
|
{ "កាន់តែពិសេសអាចខលចេញ" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
|
||||||
|
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
||||||
|
{ "Hello" , { 15043, }, },
|
||||||
|
{ " Hello" , { 29871, 15043, }, },
|
||||||
|
{ " Hello" , { 259, 15043, }, },
|
||||||
|
{ " Hello" , { 1678, 15043, }, },
|
||||||
|
{ " Hello" , { 268, 15043, }, },
|
||||||
|
{ " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, },
|
||||||
|
};
|
||||||
|
|
||||||
|
return _k_tests;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 2) {
|
||||||
|
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string fname = argv[1];
|
||||||
|
|
||||||
|
std::string fname_text;
|
||||||
|
if (argc > 2) {
|
||||||
|
fname_text = argv[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
|
||||||
|
llama_backend_init(false);
|
||||||
|
|
||||||
|
// load the vocab
|
||||||
|
{
|
||||||
|
auto lparams = llama_context_default_params();
|
||||||
|
|
||||||
|
lparams.vocab_only = true;
|
||||||
|
|
||||||
|
model = llama_load_model_from_file(fname.c_str(), lparams);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx = llama_new_context_with_model(model, lparams);
|
||||||
|
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
|
||||||
|
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_free(ctx);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = true;
|
||||||
|
|
||||||
|
for (const auto & test_kv : k_tests()) {
|
||||||
|
const std::vector<llama_token> res_bos = llama_tokenize(ctx, test_kv.first, true);
|
||||||
|
const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
printf("src: '%s'\n", test_kv.first.c_str());
|
||||||
|
printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
|
||||||
|
printf("tok: ");
|
||||||
|
for (const auto & tok : res_bos) {
|
||||||
|
printf("%d ", tok);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
|
||||||
|
if (test_kv.second[i] != res_bos[i + 1]) {
|
||||||
|
correct = false;
|
||||||
|
}
|
||||||
|
if (test_kv.second[i] != res_nobos[i]) {
|
||||||
|
correct = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!correct) {
|
||||||
|
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||||
|
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||||
|
llama_detokenize_spm(ctx, res_nobos).c_str(),
|
||||||
|
llama_detokenize_spm(ctx, test_kv.second).c_str());
|
||||||
|
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||||
|
for (const auto & t : test_kv.second) {
|
||||||
|
fprintf(stderr, "%6d, ", t);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||||
|
for (const auto & t : res_nobos) {
|
||||||
|
fprintf(stderr, "%6d, ", t);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!fname_text.empty()) {
|
||||||
|
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
|
||||||
|
|
||||||
|
std::string text;
|
||||||
|
{
|
||||||
|
std::ifstream ifs(fname_text);
|
||||||
|
if (!ifs) {
|
||||||
|
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||||
|
|
||||||
|
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||||
|
|
||||||
|
{
|
||||||
|
const std::string fname_out = fname_text + ".tokcpp";
|
||||||
|
|
||||||
|
std::ofstream ofs(fname_out);
|
||||||
|
if (!ofs) {
|
||||||
|
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & tok : res) {
|
||||||
|
ofs << tok << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
ofs << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_free(ctx);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return success ? 0 : 3;
|
||||||
|
}
|
95
tests/test-tokenizer-0-llama.py
Normal file
95
tests/test-tokenizer-0-llama.py
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
# tests with SPM tokenizer
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
|
# Command line: the directory that holds 'tokenizer.model' and, optionally, a
# text file whose content should be tokenized as well.
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer

tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')

# Test strings. The whitespace is significant: the whitespace-only entries are
# 1, 2 and 3 spaces long and the indented "Hello" entries carry 1 to 4 leading
# spaces, so every entry is distinct.
tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
]

# Human readable dump: token ids and round-tripped text, with and without BOS.
for text in tests:
    print('text: ', text)
    print('\nwith bos:')
    print(tokenizer.encode(text, add_bos=True))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
    print('\nwithout bos:')
    print(tokenizer.encode(text, add_bos=False))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))

# A few spot checks of single pieces / decodes (expected output on the right).
print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'

# The same data formatted as C++ initializer entries
# ({ "text", { id, id, ... }, },) that can be pasted into the C++ test table.
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text, add_bos=False)

    # escape the control characters so the key is a valid C++ string literal
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")

print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))

# Optionally tokenize a whole file (with BOS) and store the ids in "<file>.tok".
fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'

    # explicit encoding: the default depends on the platform locale
    with open(fname_tok, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s, add_bos=True)

    # write to file: one line of space separated token ids
    with open(fname_out, 'w', encoding='utf-8') as f_out:
        for x in res:
            f_out.write(str(x) + ' ')
        f_out.write('\n')

    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
|
|
@ -1,140 +0,0 @@
|
||||||
#include "llama.h"
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <string>
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
|
|
||||||
std::string result;
|
|
||||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
||||||
result += llama_token_to_str(ctx, tokens[i]);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
|
||||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
|
||||||
{ " ", {1, 259, }, },
|
|
||||||
{ " ", { 1, 1678, }, },
|
|
||||||
{ " ", { 1, 268, }, },
|
|
||||||
{ "\t", { 1, 29871, 12, }, },
|
|
||||||
{ "\n", { 1, 29871, 13, }, },
|
|
||||||
{ "\t\n", { 1, 29871, 12, 13, }, },
|
|
||||||
{ "Hello world", { 1, 15043, 3186, }, },
|
|
||||||
{ " Hello world", { 1, 29871, 15043, 3186, }, },
|
|
||||||
{ "Hello World", { 1, 15043, 2787, }, },
|
|
||||||
{ " Hello World", { 1, 29871, 15043, 2787, }, },
|
|
||||||
{ " Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
|
|
||||||
{ " this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
|
||||||
{ "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
|
||||||
{ "нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
|
|
||||||
{ "កាន់តែពិសេសអាចខលចេញ", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
|
|
||||||
146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
|
|
||||||
31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
|
|
||||||
161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
|
|
||||||
136, 228, 162, 132, 228, 161, 140, }, },
|
|
||||||
{ "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
|
||||||
{ 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
|
|
||||||
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
|
|
||||||
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
|
|
||||||
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
|
||||||
{ "Hello", { 1, 15043 }, },
|
|
||||||
{ " Hello", { 1, 29871, 15043 }, },
|
|
||||||
{ " Hello", { 1, 259, 15043 }, },
|
|
||||||
{ " Hello", { 1, 1678, 15043 }, },
|
|
||||||
{ " Hello", { 1, 268, 15043 }, },
|
|
||||||
{ " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
|
|
||||||
};
|
|
||||||
|
|
||||||
return _k_tests;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
if (argc < 2) {
|
|
||||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string fname = argv[1];
|
|
||||||
|
|
||||||
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
|
||||||
|
|
||||||
llama_model * model;
|
|
||||||
llama_context * ctx;
|
|
||||||
|
|
||||||
llama_backend_init(false);
|
|
||||||
|
|
||||||
// load the vocab
|
|
||||||
{
|
|
||||||
auto lparams = llama_context_default_params();
|
|
||||||
|
|
||||||
lparams.vocab_only = true;
|
|
||||||
|
|
||||||
model = llama_load_model_from_file(fname.c_str(), lparams);
|
|
||||||
|
|
||||||
if (model == NULL) {
|
|
||||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx = llama_new_context_with_model(model, lparams);
|
|
||||||
|
|
||||||
if (ctx == NULL) {
|
|
||||||
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
|
||||||
llama_free_model(model);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(ctx);
|
|
||||||
|
|
||||||
if (n_vocab != 32000) {
|
|
||||||
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
|
|
||||||
llama_free_model(model);
|
|
||||||
llama_free(ctx);
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool success = true;
|
|
||||||
|
|
||||||
for (const auto & test_kv : k_tests()) {
|
|
||||||
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
|
|
||||||
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
|
|
||||||
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
|
|
||||||
|
|
||||||
bool correct = res.size() == test_kv.second.size();
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) res.size() && correct; ++i) {
|
|
||||||
if (res[i] != test_kv.second[i]) {
|
|
||||||
correct = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!correct) {
|
|
||||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
|
||||||
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
|
||||||
unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
|
|
||||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
|
||||||
for (const auto & t : test_kv.second) {
|
|
||||||
fprintf(stderr, "%6d, ", t);
|
|
||||||
}
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
fprintf(stderr, "%s : got tokens: ", __func__);
|
|
||||||
for (const auto & t : res) {
|
|
||||||
fprintf(stderr, "%6d, ", t);
|
|
||||||
}
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
|
|
||||||
success = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_free_model(model);
|
|
||||||
llama_free(ctx);
|
|
||||||
|
|
||||||
llama_backend_free();
|
|
||||||
|
|
||||||
return success ? 0 : 3;
|
|
||||||
}
|
|
|
@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
|
||||||
std::string result;
|
|
||||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
||||||
result += llama_token_to_str(ctx, tokens[i]);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||||
|
@ -72,13 +64,13 @@ int main(int argc, char **argv) {
|
||||||
const int n_vocab = llama_n_vocab(ctx);
|
const int n_vocab = llama_n_vocab(ctx);
|
||||||
|
|
||||||
for (int i = 0; i < n_vocab; ++i) {
|
for (int i = 0; i < n_vocab; ++i) {
|
||||||
std::string forward = llama_token_to_str(ctx, i);
|
std::string forward = llama_token_to_piece(ctx, i);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
|
||||||
if (tokens.size() == 1) {
|
if (tokens.size() == 1) {
|
||||||
if (i != tokens[0]) {
|
if (i != tokens[0]) {
|
||||||
std::string backward = llama_token_to_str(ctx, tokens[0]);
|
std::string backward = llama_token_to_piece(ctx, tokens[0]);
|
||||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
|
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
|
||||||
__func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
|
__func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue