Reworking rope WIP

commit 374fffb9c6

24 changed files with 600 additions and 256 deletions
.gitignore (vendored) — 6 changes

@@ -16,6 +16,8 @@ build/
 build-em/
 build-debug/
 build-release/
+build-ci-debug/
+build-ci-release/
 build-static/
 build-cublas/
 build-opencl/

@@ -25,6 +27,10 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
+tmp/
+
+models/*
+models-mnt

 /main
 /quantize
ci/README.md — new file, 20 lines

@@ -0,0 +1,20 @@
+# CI
+
+In addition to [GitHub Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
+
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads compared to just using GitHub Actions. Over time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+It is good practice, before publishing changes, to execute the full CI locally on your machine:
+
+```bash
+mkdir tmp
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
ci/run.sh — new file, 262 lines

#!/bin/bash

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

OUT=$(realpath "$1")
MNT=$(realpath "$2")

rm -v $OUT/*.log
rm -v $OUT/*.exit
rm -v $OUT/*.md

sd=`dirname $0`
cd $sd/../
SRC=`pwd`

## helpers

# download a file if it does not exist or if it is outdated
function gg_wget {
    local out=$1
    local url=$2

    local cwd=`pwd`

    mkdir -p $out
    cd $out

    # should not re-download if file is the same
    wget -nv -N $url

    cd $cwd
}

function gg_printf {
    printf -- "$@" >> $OUT/README.md
}

function gg_run {
    ci=$1

    set -o pipefail
    set -x

    gg_run_$ci | tee $OUT/$ci.log
    cur=$?
    echo "$cur" > $OUT/$ci.exit

    set +x
    set +o pipefail

    gg_sum_$ci

    ret=$((ret | cur))
}

## ci

# ctest_debug

function gg_run_ctest_debug {
    cd ${SRC}

    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
}

function gg_sum_ctest_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
    gg_printf '\n'
}

# ctest_release

function gg_run_ctest_release {
    cd ${SRC}

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z $GG_BUILD_LOW_PERF ]; then
        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
}

function gg_sum_ctest_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

# open_llama_3b_v2

function gg_run_open_llama_3b_v2 {
    cd ${SRC}

    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    path_models="../models-mnt/open-llama/3B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.bin"
    model_q8_0="${path_models}/ggml-model-q8_0.bin"
    model_q4_0="${path_models}/ggml-model-q4_0.bin"
    model_q4_1="${path_models}/ggml-model-q4_1.bin"
    model_q5_0="${path_models}/ggml-model-q5_0.bin"
    model_q5_1="${path_models}/ggml-model-q5_1.bin"
    model_q3_k="${path_models}/ggml-model-q3_k.bin"
    model_q4_k="${path_models}/ggml-model-q4_k.bin"
    model_q5_k="${path_models}/ggml-model-q5_k.bin"
    model_q6_k="${path_models}/ggml-model-q6_k.bin"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    set +e
}

function gg_sum_open_llama_3b_v2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
}

## main

if [ -z $GG_BUILD_LOW_PERF ]; then
    rm -rf ${SRC}/models-mnt

    mnt_models=$(realpath ${MNT}/models)
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    python3 -m pip install -r ${SRC}/requirements.txt
fi

ret=0

#test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release

if [ -z $GG_BUILD_LOW_PERF ]; then
    test $ret -eq 0 && gg_run open_llama_3b_v2
fi

exit $ret
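One structural note on the script above: `gg_run` dispatches by name, so `gg_run ctest_debug` pipes `gg_run_ctest_debug` through `tee` into `$OUT/ctest_debug.log`, records the exit code in `$OUT/ctest_debug.exit`, calls `gg_sum_ctest_debug` to append a section to `$OUT/README.md`, and ORs the exit code into `ret`. Extending the CI therefore means defining a `gg_run_<step>`/`gg_sum_<step>` pair and adding a `test $ret -eq 0 && gg_run <step>` line in the `main` section, where `<step>` is a placeholder, not a name from this commit.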
@@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--rope-freq-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_base = std::stof(argv[i]);
+        } else if (arg == "--rope-freq-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {

@@ -267,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--chunks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_chunks = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -493,6 +511,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
     fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");

@@ -501,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }

@@ -573,6 +594,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     lparams.use_mlock   = params.use_mlock;
     lparams.logits_all  = params.perplexity;
     lparams.embedding   = params.embedding;
+    lparams.rope_freq_base  = params.rope_freq_base;
+    lparams.rope_freq_scale = params.rope_freq_scale;

     return lparams;
 }
@@ -28,10 +28,13 @@ struct gpt_params {
     int32_t n_ctx        = 512; // context size
     int32_t n_batch      = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep       = 0;   // number of tokens to keep from initial prompt
+    int32_t n_chunks     = -1;  // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers = 0;   // number of layers to store in VRAM
     int32_t main_gpu     = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs      = 0;   // if greater than 0, output the probabilities of top n_probs tokens.
+    float   rope_freq_base  = 10000.0f; // RoPE base frequency
+    float   rope_freq_scale = 1.0f;     // RoPE frequency scaling factor

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
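Together with the parsing hunks above, these fields thread the two RoPE knobs from the command line through `gpt_params` into `llama_context_params`. As an illustration (the model path is hypothetical), a run intended to stretch a 2048-token model to a 4096 context by halving the frequency scale would look like `./main -m models/ggml-model-q4_0.bin -c 4096 --rope-freq-scale 0.5 -p "..."`; the defaults (base 10000.0, scale 1.0) reproduce the previous unscaled RoPE behavior.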
@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
         return 0;
     }

+    if (params.rope_freq_base != 10000.0) {
+        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 1.0) {
+        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    }
+
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
+                " you are on your own\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
@@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // BOS tokens will be added for each chunk before eval
     auto tokens = ::llama_tokenize(ctx, params.prompt, true);

-    int count = 0;
+    const int n_chunk_max = tokens.size() / params.n_ctx;

-    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
     const int n_vocab = llama_n_vocab(ctx);
     const int n_batch = params.n_batch;

+    int count = 0;
     double nll = 0.0;

     fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

     for (int i = 0; i < n_chunk; ++i) {
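For reference, ci/run.sh above invokes this with `-c 128 -b 128 --chunks 3`: if `wiki.test-60.raw` tokenizes to N tokens, the run evaluates min(3, N/128) chunks rather than the full N/128, which is what keeps the CI perplexity check fast. The `--chunks -1` default preserves the old evaluate-everything behavior.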
@@ -12,103 +12,27 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    {
-        "Q4_0",
-        LLAMA_FTYPE_MOSTLY_Q4_0,
-        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
-    },
-    {
-        "Q4_1",
-        LLAMA_FTYPE_MOSTLY_Q4_1,
-        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
-    },
-    {
-        "Q5_0",
-        LLAMA_FTYPE_MOSTLY_Q5_0,
-        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
-    },
-    {
-        "Q5_1",
-        LLAMA_FTYPE_MOSTLY_Q5_1,
-        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
-    },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
 #ifdef GGML_USE_K_QUANTS
-    {
-        "Q2_K",
-        LLAMA_FTYPE_MOSTLY_Q2_K,
-        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
-    },
-    {
-        "Q3_K",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "alias for Q3_K_M"
-    },
-    {
-        "Q3_K_S",
-        LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_M",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_L",
-        LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
-    },
-    {
-        "Q4_K",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "alias for Q4_K_M",
-    },
-    {
-        "Q4_K_S",
-        LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
-    },
-    {
-        "Q4_K_M",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
-    },
-    {
-        "Q5_K",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "alias for Q5_K_M",
-    },
-    {
-        "Q5_K_S",
-        LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
-    },
-    {
-        "Q5_K_M",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
-    },
-    {
-        "Q6_K",
-        LLAMA_FTYPE_MOSTLY_Q6_K,
-        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
-    },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
 #endif
-    {
-        "Q8_0",
-        LLAMA_FTYPE_MOSTLY_Q8_0,
-        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
-    },
-    {
-        "F16",
-        LLAMA_FTYPE_MOSTLY_F16,
-        "13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
-    },
-    {
-        "F32",
-        LLAMA_FTYPE_ALL_F32,
-        "26.00G @ 7B - absolutely huge, lossless - not recommended",
-    },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G @ 7B", },
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B", },
 };
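This table maps the CLI type names to ftype enums, including aliases: `./bin/quantize ggml-model-f16.bin ggml-model-q4_k.bin q4_k` (the exact invocation pattern used in ci/run.sh above) produces a Q4_K_M file, since `Q4_K` resolves to `LLAMA_FTYPE_MOSTLY_Q4_K_M`.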
@@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
 ```sh
 curl --request POST \
     --url http://localhost:8080/completion \
+    --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
 ```

@@ -32,6 +32,7 @@ tokenize() {
         --silent \
         --request POST \
         --url "${API_URL}/tokenize" \
+        --header "Content-Type: application/json" \
         --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
     | jq '.tokens[]'
 }

@@ -64,6 +65,7 @@ chat_completion() {
         --no-buffer \
         --request POST \
         --url "${API_URL}/completion" \
+        --header "Content-Type: application/json" \
         --data-raw "${DATA}")

     printf "\n"
@@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");

@@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.n_ctx = std::stoi(argv[i]);
    }
+   else if (arg == "--rope-freq-base")
+   {
+       if (++i >= argc) {
+           invalid_param = true;
+           break;
+       }
+       params.rope_freq_base = std::stof(argv[i]);
+   }
+   else if (arg == "--rope-freq-scale")
+   {
+       if (++i >= argc) {
+           invalid_param = true;
+           break;
+       }
+       params.rope_freq_scale = std::stof(argv[i]);
+   }
    else if (arg == "--memory-f32" || arg == "--memory_f32")
    {
        params.memory_f16 = false;
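With these two hunks the server exposes the same knobs as `main`. A hypothetical launch with a stretched context would be `./server -m models/ggml-model-q4_0.bin -c 4096 --rope-freq-scale 0.5` (model path illustrative), after which the curl examples above work unchanged.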
expose.h — 3 changes

@@ -36,7 +36,8 @@ struct load_model_inputs
     const int debugmode = 0;
     const int forceversion = 0;
     const int gpulayers = 0;
-    const bool linear_rope;
+    const float rope_freq_scale = 1.0f;
+    const float rope_freq_base = 10000.0f;
     const char * banned_tokens[ban_token_max];
 };
 struct generation_inputs
ggml-cuda.cu — 25 changes

@@ -2977,10 +2977,8 @@ inline void ggml_cuda_op_rope(
     const int mode  = ((int32_t *) src1->data)[2];
     const int n_ctx = ((int32_t *) src1->data)[3];

-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
-    const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
-
-    const float p = get_ntk_rope_scale_mode()?p0:(n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);

     bool is_glm = mode & 4;

@@ -3564,6 +3562,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) dst;
 }

+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);

@@ -3697,7 +3700,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
         }
     }

@@ -3803,6 +3806,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
+        case GGML_OP_DUP:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_ADD:
             if (!any_on_device) {
                 return false;

@@ -3857,6 +3866,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             }
             func = ggml_cuda_cpy;
             break;
+        case GGML_OP_CONT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_dup;
+            break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
ggml-metal.m — 45 changes

@@ -881,28 +881,35 @@ void ggml_metal_graph_compute(
                        const int n_past = ((int32_t *)(src1->data))[0];

+                       float freq_base;
+                       float freq_scale;
+                       memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+                       memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
                        [encoder setComputePipelineState:ctx->pipeline_rope];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                        [encoder setBytes:&n_past length:sizeof( int) atIndex:18];
                        [encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
                        [encoder setBytes:&mode   length:sizeof( int) atIndex:20];
+                       [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
+                       [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;

@@ -656,17 +656,19 @@ kernel void kernel_rope(
     constant     int & n_past,
     constant     int & n_dims,
     constant     int & mode,
+    constant   float & freq_base,
+    constant   float & freq_scale,
     uint3 tpig[[thread_position_in_grid]]) {
     const int64_t i3 = tpig[2];
     const int64_t i2 = tpig[1];
     const int64_t i1 = tpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = (float)p;
+    float theta = freq_scale * (float)p;

     if (!is_neox) {
         for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
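In the CPU and Metal kernels the rotation angle now picks up both parameters: `theta` starts at `freq_scale * p` and is multiplied by `theta_scale = pow(freq_base, -2/n_dims)` once per dimension pair, so for pair $i$ ($0 \le i < n_{dims}/2$) at position $p$ the angle is

$$\theta_i = \mathrm{freq\_scale} \cdot p \cdot \mathrm{freq\_base}^{-2i/n_{dims}}$$

The defaults $\mathrm{freq\_base} = 10000$, $\mathrm{freq\_scale} = 1$ reduce to the standard RoPE formula. Note that the CUDA path in this WIP still hard-codes the 10000.0 base (see the ggml-cuda.cu hunk above), so it does not yet honor the new parameters.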
ggml.c — 112 changes

@@ -4292,35 +4292,6 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)

-static bool useNtkRope = true; //uses linear rope if not NTK
-void set_ntk_rope_scale_mode(bool useNtk)
-{
-    useNtkRope = useNtk;
-}
-bool get_ntk_rope_scale_mode()
-{
-    return useNtkRope;
-}
-float get_theta_scale(int n_dims,int n_past,int n_ctx)
-{
-    if (!get_ntk_rope_scale_mode())
-    {
-        return powf(10000.0, -2.0f / n_dims);
-    }
-    if (n_ctx <= 2048) //normie mode
-    {
-        return powf(10000.0, -2.0f / n_dims);
-    }
-    else
-    {
-        //using scaled NTK aware ctx
-        float a = (n_ctx <= 4096 ? 4.0 : 8.0);
-        float m = powf(a, n_dims / (n_dims - 2.0));
-        float s = powf(10000.0 * m, -2.0f / n_dims);
-        return s;
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_context * ggml_init(struct ggml_init_params params) {
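For the record, the removed NTK-aware helper was equivalent to inflating the RoPE base once the context exceeded the 2048-token training length. In the notation of the deleted code, with $a = 4$ for $n_{ctx} \le 4096$ and $a = 8$ otherwise:

$$\theta_{scale} = \left(10000 \cdot a^{\,n_{dims}/(n_{dims}-2)}\right)^{-2/n_{dims}}$$

The new `freq_base`/`freq_scale` pair subsumes both styles of context extension: choosing a larger base gives the NTK-aware variant, while choosing a scale below 1 gives the linear variant, which is why the global mode flag and the `GGML_TRAINING_CTX` constant can be dropped.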
@@ -4442,8 +4413,8 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;

-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+                    __func__, i, ggml_used_mem(ctx));

             if (ctx->mem_buffer_owned) {
                 GGML_ALIGNED_FREE(ctx->mem_buffer);

@@ -6986,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_past,
         int                   n_dims,
         int                   mode,
+        float                 freq_base,
+        float                 freq_scale,
         int                   n_ctx,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);

@@ -6999,12 +6972,14 @@ struct ggml_tensor * ggml_rope_impl(

     ggml_scratch_save(ctx);

-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);

     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
     ((int32_t *) b->data)[3] = n_ctx;
+    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
+    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));

     ggml_scratch_load(ctx);

@@ -7023,7 +6998,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
 }

 struct ggml_tensor * ggml_rope_inplace(

@@ -7033,7 +7008,19 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
+}
+
+struct ggml_tensor * ggml_rope_custom_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 freq_base,
+        float                 freq_scale,
+        int                   n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
 }

 // ggml_rope_back
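A sketch of how a caller would use the new entry point. This is illustrative only — the wrapper function and shapes are stand-ins, not code from the commit — but the `ggml_rope_custom_inplace` signature matches the hunk above, and passing `(10000.0f, 1.0f)` is by construction identical to plain `ggml_rope_inplace`.

```c
#include "ggml.h"

// Illustrative only: apply RoPE with a halved frequency scale, i.e. linear
// position interpolation to roughly 2x the training context. The caller and
// the surrounding graph construction are placeholders.
static struct ggml_tensor * rope_scaled_example(struct ggml_context * ctx,
                                                struct ggml_tensor  * cur,
                                                int n_past, int n_rot) {
    // mode 0 = standard RoPE (neither the NeoX nor the GLM bit set);
    // n_ctx is carried along for the GLM code path
    const int n_ctx = 4096;

    // freq_base = 10000.0f keeps the default base; freq_scale = 0.5f
    // compresses positions so a 2048-token model spans 4096 tokens
    return ggml_rope_custom_inplace(ctx, cur, n_past, n_rot, 0,
                                    10000.0f, 0.5f, n_ctx);
}
```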
@@ -12104,16 +12091,21 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

     assert(n_past >= 0);

@@ -12142,7 +12134,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);

     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;

@@ -12154,7 +12146,7 @@ static void ggml_compute_forward_rope_f32(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = freq_scale * (float)p;

             if (is_glm) {
                 theta = MIN(p, n_ctx - 2);

@@ -12182,9 +12174,6 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
                 }
             } else if (!is_neox) {
-                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);

@@ -12234,16 +12223,21 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

+    float freq_base;
+    float freq_scale;
+
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
     const int n_ctx  = ((int32_t *) src1->data)[3];
+    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));

     assert(n_past >= 0);

@@ -12272,7 +12266,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);

     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;

@@ -12284,7 +12278,7 @@ static void ggml_compute_forward_rope_f16(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;

-            float theta = (float)p;
+            float theta = freq_scale * (float)p;

             if (is_glm) {
                 theta = MIN(p, n_ctx - 2);

@@ -12312,9 +12306,6 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                 }
             } if (!is_neox) {
-                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);

@@ -12348,7 +12339,7 @@ static void ggml_compute_forward_rope_f16(
                     const float x0 = GGML_FP16_TO_FP32(src[0]);
                     const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);

                     dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                     dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                 }
             }

@@ -12400,7 +12391,6 @@ static void ggml_compute_forward_rope_back_f32(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
-    const int n_ctx  = ((int32_t *) src1->data)[3];

     assert(n_past >= 0);

@@ -12426,7 +12416,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);

     const bool is_neox = mode & 2;

@@ -12440,9 +12430,6 @@ static void ggml_compute_forward_rope_back_f32(
             float theta = (float)p;

             if (!is_neox) {
-                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);

@@ -12503,7 +12490,6 @@ static void ggml_compute_forward_rope_back_f16(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode   = ((int32_t *) src1->data)[2];
-    const int n_ctx  = ((int32_t *) src1->data)[3];

     assert(n_past >= 0);

@@ -12529,7 +12515,7 @@ static void ggml_compute_forward_rope_back_f16(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);

     const bool is_neox = mode & 2;

@@ -12543,9 +12529,6 @@ static void ggml_compute_forward_rope_back_f16(
             float theta = (float)p;

             if (!is_neox) {
-                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -15754,7 +15737,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
             // necessary for llama
             if (src0->grad) {
                 assert(src1->type == GGML_TYPE_I32);
-                assert(ggml_nelements(src1) == 4);
+                assert(ggml_nelements(src1) == 6);
                 const int n_past = ((int32_t *) src1->data)[0];
                 const int n_dims = ((int32_t *) src1->data)[1];
                 const int mode   = ((int32_t *) src1->data)[2];

@@ -15775,7 +15758,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
         {
             if (src0->grad) {
                 assert(src1->type == GGML_TYPE_I32);
-                assert(ggml_nelements(src1) == 4);
+                assert(ggml_nelements(src1) == 3);
                 const int n_past = ((int32_t *) src1->data)[0];
                 const int n_dims = ((int32_t *) src1->data)[1];
                 const int mode   = ((int32_t *) src1->data)[2];

@@ -16335,8 +16318,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
                     params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }

             // distribute new work or execute it direct if 1T

@@ -16366,8 +16349,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.type = GGML_TASK_FINALIZE;
                 ggml_compute_forward(&params, node);
-                ggml_graph_compute_perf_stats_node(node, state->shared);
             }

+            ggml_graph_compute_perf_stats_node(node, state->shared);
         } else {
             break;
         }

@@ -16909,9 +16893,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
 }

 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    //assert(cgraph->work == NULL);
-    //assert(cgraph->work_size == 0);
-
     uint64_t size_eval = 0;

     // compute size of intermediate results

@@ -17350,9 +17331,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {

     GGML_PRINT("=== GRAPH ===\n");

-    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
21
ggml.h
@@ -207,12 +207,6 @@
 
 #define GGML_UNUSED(x) (void)(x)
 
-// Maximum training context of the model in use
-// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
-#ifndef GGML_TRAINING_CTX
-#define GGML_TRAINING_CTX 2048
-#endif
-
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -551,10 +545,6 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
-    GGML_API void  set_ntk_rope_scale_mode(bool useNtk);
-    GGML_API bool  get_ntk_rope_scale_mode();
-    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
-
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -1130,6 +1120,17 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
+    // custom RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 freq_base,
+            float                 freq_scale,
+            int                   n_ctx);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
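A short usage sketch for the new API. Calling it with `freq_base = 10000.0f` and `freq_scale = 1.0f` should reproduce plain `ggml_rope_inplace`; linear context scaling sets `freq_scale < 1`. The surrounding setup below is illustrative boilerplate, not code from this diff:

```cpp
#include "ggml.h"

// Sketch: applying custom RoPE to a (head_dim, n_head, N) activation tensor.
// freq_scale = 0.5f corresponds to linear scaling for a 2x-stretched context;
// raising freq_base instead gives NTK-aware style scaling.
struct ggml_tensor * rope_example(struct ggml_context * ctx0,
                                  struct ggml_tensor  * cur,
                                  int n_past, int n_rot) {
    const float freq_base  = 10000.0f;
    const float freq_scale = 0.5f; // illustrative: 2x linear context stretch
    return ggml_rope_custom_inplace(ctx0, cur, n_past, n_rot, /*mode=*/0,
                                    freq_base, freq_scale, /*n_ctx=*/0);
}
```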
@@ -348,12 +348,32 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
     = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
 
-    //handle linear rope
-    if(inputs.linear_rope)
+    //determine rope scaling params
+    float rope_freq_scale = 1.0f;
+    float rope_freq_base = 10000.0f;
+    if(inputs.rope_freq_scale>0.0f)
     {
-        printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n");
+        rope_freq_scale = inputs.rope_freq_scale;
+        rope_freq_base = inputs.rope_freq_base;
+        printf("Using Custom RoPE scaling (scale:%.3f, base:%.1f).\n",rope_freq_scale,rope_freq_base);
     }
-    set_ntk_rope_scale_mode(!inputs.linear_rope);
+    else
+    {
+        rope_freq_scale = 1.0f;
+        if (params.n_ctx <= 2048) //normie mode
+        {
+            rope_freq_base = 10000.0f;
+        }
+        else
+        {
+            //approximate NTK aware ctx
+            rope_freq_base = (params.n_ctx <= 4096 ? 40880.0f : 82684.0f);
+        }
+
+        printf("Using automatic RoPE scaling (scale:%.3f, base:%.1f)\n",rope_freq_scale,rope_freq_base);
+    }
+    gptj_ctx_v3.hparams.rope_freq_scale = neox_ctx_v3.hparams.rope_freq_scale = rope_freq_scale;
+    gptj_ctx_v3.hparams.rope_freq_base = neox_ctx_v3.hparams.rope_freq_base = rope_freq_base;
 
     //handle custom token bans
     banned_tokens.clear();
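Extracted as a standalone helper, the automatic policy above reduces to a small step function of the requested context. This is a sketch restating the hunk's logic, not an additional code path:

```cpp
// Sketch: the automatic RoPE policy from the hunk above, as a pure function.
// Scale stays 1.0; only the base frequency grows with the requested context.
struct rope_params { float freq_scale; float freq_base; };

rope_params auto_rope_params(int n_ctx) {
    rope_params p = { 1.0f, 10000.0f };
    if (n_ctx > 2048) {
        // approximate NTK-aware bases for 4k and 8k contexts
        p.freq_base = (n_ctx <= 4096 ? 40880.0f : 82684.0f);
    }
    return p;
}
```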
@@ -444,6 +464,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.use_mlock = inputs.use_mlock;
     llama_ctx_params.n_gpu_layers = inputs.gpulayers;
     llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+    llama_ctx_params.rope_freq_base = rope_freq_base;
+    llama_ctx_params.rope_freq_scale = rope_freq_scale;
 
     llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
 
11
koboldcpp.py
@@ -36,7 +36,8 @@ class load_model_inputs(ctypes.Structure):
                 ("debugmode", ctypes.c_int),
                 ("forceversion", ctypes.c_int),
                 ("gpulayers", ctypes.c_int),
-                ("linear_rope", ctypes.c_bool),
+                ("rope_freq_scale", ctypes.c_float),
+                ("rope_freq_base", ctypes.c_float),
                 ("banned_tokens", ctypes.c_char_p * ban_token_max)]
 
 class generation_inputs(ctypes.Structure):
@@ -189,7 +190,11 @@ def load_model(model_filename):
     inputs.blasbatchsize = args.blasbatchsize
     inputs.forceversion = args.forceversion
     inputs.gpulayers = args.gpulayers
-    inputs.linear_rope = args.linearrope
+    inputs.rope_freq_scale = args.ropeconfig[0]
+    if len(args.ropeconfig)>1:
+        inputs.rope_freq_base = args.ropeconfig[1]
+    else:
+        inputs.rope_freq_base = 10000
     clblastids = 0
     if args.useclblast:
         clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
@@ -1434,7 +1439,7 @@ if __name__ == '__main__':
     parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
     parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
     parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
-    parser.add_argument("--linearrope", help="If set, uses linear RoPE scaling. Otherwise, uses NTK-Aware scaling.", action='store_true')
+    parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
     parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')
107
llama.cpp
@@ -102,14 +102,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
-        { MODEL_7B,    512ull * MB },
-        { MODEL_13B,   512ull * MB },
-        { MODEL_30B,   640ull * MB },
-        { MODEL_65B,  1024ull * MB },
+        /* empirical scaling, still a guess */
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 180ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 320ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 460ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 620ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 860ull) * MB },
     };
     return k_sizes;
 }
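As a worked example of the new formulas: at the old default `n_ctx = 2048`, a 7B model gets `2048/16 + 320 = 448` MB of scratch, close to the previous flat 512 MB, while `n_ctx = 8192` grows it to `8192/16 + 320 = 832` MB rather than multiplying the whole buffer. A quick check:

```cpp
#include <cstddef>
#include <cstdio>

// Sketch: evaluating the n_ctx-dependent scratch formula from the hunk above
// for the MODEL_7B row, at a few context sizes.
int main() {
    const size_t MB = 1024*1024;
    for (int n_ctx : {2048, 4096, 8192}) {
        const size_t scratch0 = ((size_t) n_ctx / 16ull + 320ull) * MB;
        printf("n_ctx = %4d -> MODEL_7B scratch0 = %zu MB\n", n_ctx, scratch0 / MB);
    }
    return 0; // prints 448, 576, 832
}
```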
@@ -141,14 +142,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
-        { MODEL_7B,   800ull * MB },
-        { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1380ull * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  800ull) * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1380ull) * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
     };
     return k_sizes;
 }
@@ -190,6 +191,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -648,7 +653,7 @@ struct llama_model_loader {
         *ctx_size_p = *mmapped_size_p = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
         }
     }
 
@@ -844,6 +849,8 @@ struct llama_context_params llama_context_default_params() {
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.rope_freq_base              =*/ 10000.0f,
+        /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
@@ -967,6 +974,8 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1001,22 +1010,27 @@ static void llama_model_load_internal(
         }
 
         hparams.n_ctx = n_ctx;
+
+        hparams.rope_freq_base = rope_freq_base;
+        hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
         fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
         fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
         fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
         fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
         fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1162,13 +1176,12 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
         // this is the total memory required to run the inference
-        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
         const size_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type)*bigctxmul +
-            MEM_REQ_SCRATCH1().at(model.type)*bigctxmul +
-            MEM_REQ_EVAL().at (model.type);
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1272,6 +1285,8 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1280,7 +1295,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1332,6 +1347,9 @@ static bool llama_eval_internal(
     const int n_rot        = hparams.n_embd/hparams.n_head;
     const int n_gpu_layers = model.n_gpu_layers;
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
@@ -1429,11 +1447,11 @@ static bool llama_eval_internal(
                 offload_func_kq(tmpq);
                 ggml_set_name(tmpq, "tmpq");
 
-                struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+                struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                 offload_func_kq(Kcur);
                 ggml_set_name(Kcur, "Kcur");
 
-                struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+                struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
                 offload_func_kq(Qcur);
                 ggml_set_name(Qcur, "Qcur");
 
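In terms of rotation angles, the two parameters threaded through here modify standard RoPE as follows, with p the token position and d = n_rot (this restates the kernel math rather than quoting the diff):

```latex
\theta_{p,i} = \mathrm{freq\_scale} \cdot p \cdot \mathrm{freq\_base}^{-2i/d},
\qquad i = 0, 1, \dots, d/2 - 1
```

Lowering freq_scale compresses positions linearly, while raising freq_base flattens the whole frequency spectrum, which is the NTK-aware approach.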
@@ -2007,9 +2025,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
     }
 
     float cum_sum = 0.0f;
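The guard matters when the candidate distribution is tiny or flat: the second differences can all be zero, so the old unconditional division produced NaNs. A minimal reproduction of the failure mode and its fix (illustrative values, not from the diff):

```cpp
#include <cstdio>
#include <numeric>
#include <vector>

// Sketch: why the tail-free normalization needs the sum guard.
// A flat distribution has zero second differences, so "value /= sum"
// previously divided by zero.
int main() {
    std::vector<float> second_derivatives = {0.0f, 0.0f, 0.0f};
    const float sum = std::accumulate(second_derivatives.begin(),
                                      second_derivatives.end(), 0.0f);
    if (sum > 1e-6f) {
        for (float & v : second_derivatives) v /= sum;
    } else {
        // fall back to a uniform distribution, as in the hunk above
        for (float & v : second_derivatives) v = 1.0f / second_derivatives.size();
    }
    printf("normalized[0] = %g\n", second_derivatives[0]); // 0.333333, not NaN
    return 0;
}
```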
@@ -2188,7 +2215,7 @@ void llama_sample_classifier_free_guidance(
           struct llama_context * guidance_ctx,
           float scale,
           float smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
 
     assert(ctx);
     auto n_vocab = llama_n_vocab(ctx);
@@ -2676,8 +2703,9 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+                params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
@@ -2752,11 +2780,10 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
 
-        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }
 
 #ifdef GGML_USE_METAL
5
llama.h
@@ -89,6 +89,11 @@ extern "C" {
         int32_t n_gpu_layers;                    // number of layers to store in VRAM
         int32_t main_gpu;                        // the GPU that is used for scratch and small tensors
         float   tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float   rope_freq_base;  // RoPE base frequency
+        float   rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
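From the public API side, callers opt in through the context params; leaving the new fields at their defaults (10000.0 / 1.0) keeps the previous behaviour. A hedged usage sketch, with the model path purely illustrative:

```cpp
#include "llama.h"

// Sketch: requesting linear RoPE scaling for a 4096-token context
// via the two new llama_context_params fields.
int load_scaled(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;
    params.rope_freq_base  = 10000.0f; // default base
    params.rope_freq_scale = 0.5f;     // compress positions 2x for a 2x context

    struct llama_model * model =
        llama_load_model_from_file("model.ggml" /* illustrative path */, params);
    return model != NULL ? 0 : 1;
}
```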
@@ -409,6 +409,9 @@ bool gptj_eval(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_rot;
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     static size_t buf_size = 256u*1024*1024;
     static void * buf = malloc(buf_size);
 
@@ -475,8 +478,8 @@ bool gptj_eval(
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, n_ctx);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, n_ctx);
 
             // store key and value to memory
             {
@@ -425,6 +425,9 @@ bool gpt_neox_eval(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_rot;
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+
     static size_t buf_size = 256u*1024*1024;
     static void * buf = malloc(buf_size);
 
@@ -503,8 +506,8 @@ bool gpt_neox_eval(
             struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
 
             // using mode = 2 for GPT-NeoX mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, n_ctx);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, n_ctx);
+            Qcur = ggml_rope_custom_inplace(ctx0, Qcur, n_past, n_rot, 2, freq_base, freq_scale, n_ctx);
+            Kcur = ggml_rope_custom_inplace(ctx0, Kcur, n_past, n_rot, 2, freq_base, freq_scale, n_ctx);
 
             // store key and value to memory
             {
@@ -24,6 +24,9 @@ struct gptj_hparams {
     int32_t n_layer = 28;
     int32_t n_rot   = 64;
     int32_t ftype   = 1;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
 };
 
 struct gptj_layer {
@@ -309,6 +312,9 @@ struct gpt_neox_hparams {
     int32_t n_rot   = 32; // rotary_pct * (n_embd / n_head)
     int32_t par_res = 1;  // 1 = true, 0 = false
     int32_t ftype   = 1;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
 };
 
 struct gpt_neox_layer_v2 {