Reworking rope WIP

This commit is contained in:
Concedo 2023-07-19 00:54:41 +08:00
commit 374fffb9c6
24 changed files with 600 additions and 256 deletions

6
.gitignore vendored
View file

@ -16,6 +16,8 @@ build/
build-em/ build-em/
build-debug/ build-debug/
build-release/ build-release/
build-ci-debug/
build-ci-release/
build-static/ build-static/
build-cublas/ build-cublas/
build-opencl/ build-opencl/
@ -25,6 +27,10 @@ build-no-accel/
build-sanitize-addr/ build-sanitize-addr/
build-sanitize-thread/ build-sanitize-thread/
out/ out/
tmp/
models/*
models-mnt
/main /main
/quantize /quantize

20
ci/README.md Normal file
View file

@ -0,0 +1,20 @@
# CI
In addition to [GitHub Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads than GitHub Actions alone can handle. Over time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
It is good practice to execute the full CI locally on your machine before publishing changes:
```bash
mkdir tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```

262
ci/run.sh Normal file
View file

@ -0,0 +1,262 @@
#!/bin/bash
#
# Custom CI entry point.
#
# usage: run.sh <output-dir> <mnt-dir>
#
#   <output-dir>  receives the *.log, *.exit and *.md results of this run
#   <mnt-dir>     second working directory, created if missing (stored in MNT)

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

OUT=$(realpath "$1")
MNT=$(realpath "$2")

# drop the results of a previous run; -f keeps a fresh (empty) output dir from
# producing "No such file" errors when a glob matches nothing
rm -fv "$OUT"/*.log
rm -fv "$OUT"/*.exit
rm -fv "$OUT"/*.md

# SRC is the parent of the directory that contains this script
sd=`dirname "$0"`
cd "$sd/../" || exit 1
SRC=`pwd`
## helpers
# download a file if it does not exist or if it is outdated
#   $1 - destination directory (created if missing)
#   $2 - URL to fetch
# returns the exit status of wget (or 1 if the destination cannot be entered)
function gg_wget {
    local out=$1
    local url=$2

    local cwd=`pwd`

    mkdir -p "$out"
    # never download into whatever directory we happen to be in
    cd "$out" || return 1

    # should not re-download if file is the same
    wget -nv -N "$url"
    local rc=$?

    cd "$cwd"

    return $rc
}
# printf-style helper: appends the formatted text to $OUT/README.md
#   $1    - printf format string
#   $2... - format arguments
function gg_printf {
    # the target is quoted so an output directory containing spaces still works
    printf -- "$@" >> "$OUT/README.md"
}
# run a single CI step
#   $1 - step name: gg_run_<name> performs the step, gg_sum_<name> writes its summary
#
# The step's stdout goes to $OUT/<name>.log and its exit status to $OUT/<name>.exit.
# `ci` is deliberately global (the gg_run_*/gg_sum_* functions read ${ci}) and the
# status is OR-ed into the global `ret`.
function gg_run {
    ci=$1

    # with pipefail, $? below reflects a failing step instead of the status of tee
    set -o pipefail
    set -x

    gg_run_$ci | tee "$OUT/$ci.log"
    cur=$?
    echo "$cur" > "$OUT/$ci.exit"

    set +x
    set +o pipefail

    gg_sum_$ci

    ret=$((ret | cur))
}
## ci
# ctest_debug
#
# Configures and builds a Debug tree in build-ci-debug and runs ctest in it
# (test-opt is excluded). Each stage is timed and its output is appended to its own
# $OUT/<step>-{cmake,make,ctest}.log file.
function gg_run_ctest_debug {
    cd "${SRC}"

    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

    # abort the step on the first failing stage
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                               ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"

    set +e
}
# append the markdown summary of the ctest_debug step (exit status + ctest log) to the report
function gg_sum_ctest_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in debug mode\n- status: %s\n' "$(cat $OUT/${ci}.exit)"

    # ctest output as a fenced block, followed by a separating blank line
    gg_printf '```\n%s\n```\n\n' "$(cat $OUT/${ci}-ctest.log)"
}
# ctest_release
#
# Configures and builds a Release tree in build-ci-release and runs ctest in it.
# test-opt is excluded when GG_BUILD_LOW_PERF is set. Each stage is timed and its
# output is appended to its own $OUT/<step>-{cmake,make,ctest}.log file.
function gg_run_ctest_release {
    cd "${SRC}"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    # abort the step on the first failing stage
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                               ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    # quoted so that a value containing whitespace cannot break the test expression
    if [ -z "${GG_BUILD_LOW_PERF}" ]; then
        (time ctest --output-on-failure ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    else
        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    fi

    set +e
}
# append the markdown summary of the ctest_release step (exit status + ctest log) to the report
function gg_sum_ctest_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in release mode\n- status: %s\n' "$(cat $OUT/${ci}.exit)"

    # ctest output as a fenced block
    gg_printf '```\n%s\n```\n' "$(cat $OUT/${ci}-ctest.log)"
}
# open_llama_3b_v2
#
# Downloads OpenLLaMA 3B-v2 and wikitext-2, builds a Release tree with -DLLAMA_QKK_64=1,
# converts the model, quantizes the f16 result and then runs text generation and a
# short perplexity pass for f16 and every quantized variant.
# Per-variant output is appended to $OUT/<step>-tg-<variant>.log, the perplexity
# verdicts to $OUT/<step>-ppl.log.
function gg_run_open_llama_3b_v2 {
    cd "${SRC}"

    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    # a 60-line excerpt is enough for the short perplexity pass below
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    # relative to the build directory entered below
    path_models="../models-mnt/open-llama/3B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    # abort the step on the first failing stage
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                              ) 2>&1 | tee -a "$OUT/${ci}-make.log"

    python3 ../convert.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.bin"
    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    # quantization types produced from the f16 model, in processing order
    local quants="q8_0 q4_0 q4_1 q5_0 q5_1 q3_k q4_k q5_k q6_k"
    local q

    for q in ${quants}; do
        ./bin/quantize "${model_f16}" "${path_models}/ggml-model-${q}.bin" "${q}"
    done

    # text generation with a fixed seed for every variant
    for q in f16 ${quants}; do
        (time ./bin/main --model "${path_models}/ggml-model-${q}.bin" -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-${q}.log"
    done

    # perplexity output is appended to the same -tg log that check_ppl scans afterwards
    for q in f16 ${quants}; do
        (time ./bin/perplexity --model "${path_models}/ggml-model-${q}.bin" -f "${wiki_test_60}" -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a "$OUT/${ci}-tg-${q}.log"
    done

    # check_ppl <variant> <perplexity output>
    # Takes the last decimal number found in $2 as the perplexity and prints a verdict.
    # Returns 20 when the value is missing or greater than 20.0, 0 otherwise.
    function check_ppl {
        qnt="$1"
        # "|| true": an empty match has to reach the explicit check below instead of
        # aborting silently through errexit/pipefail
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || true

        # without a value, bc and the numeric test below would only produce syntax errors
        if [ -z "$ppl" ]; then
            printf ' - %s @ n/a (FAIL: no perplexity value found)\n' "$qnt"
            return 20
        fi

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    for q in f16 ${quants}; do
        check_ppl "${q}" "$(cat "$OUT/${ci}-tg-${q}.log" | grep "^\[1\]")" | tee -a "$OUT/${ci}-ppl.log"
    done

    set +e
}
# append the markdown summary of the open_llama_3b_v2 step to the report:
# exit status, perplexity verdicts and the per-variant logs
function gg_sum_open_llama_3b_v2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"

    # the f16 heading carries a trailing space, so it is emitted separately
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"

    local variant
    for variant in q8_0 q4_0 q4_1 q5_0 q5_1 q3_k q4_k q5_k q6_k; do
        gg_printf "- ${variant}:"'\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-${variant}.log)"
    done
}
## main

# Model storage is only prepared when GG_BUILD_LOW_PERF is unset: models-mnt in the
# source tree becomes a symlink to <mnt-dir>/models.
# The variable is quoted so a value containing whitespace cannot break the test.
if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    rm -rf "${SRC}/models-mnt"

    mnt_models=$(realpath "${MNT}/models")
    mkdir -p "${mnt_models}"
    ln -sfn "${mnt_models}" "${SRC}/models-mnt"

    python3 -m pip install -r "${SRC}/requirements.txt"
fi

# accumulated status of all steps; gg_run ORs each step's exit code into it
ret=0

#test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release

if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    test $ret -eq 0 && gg_run open_llama_3b_v2
fi

exit $ret

View file

@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
} else if (arg == "--rope-freq-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
} else if (arg == "--memory-f32") { } else if (arg == "--memory-f32") {
params.memory_f16 = false; params.memory_f16 = false;
} else if (arg == "--top-p") { } else if (arg == "--top-p") {
@ -267,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_keep = std::stoi(argv[i]); params.n_keep = std::stoi(argv[i]);
} else if (arg == "--chunks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_chunks = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") { } else if (arg == "-m" || arg == "--model") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -493,6 +511,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor); fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
@ -501,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) { if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
@ -573,6 +594,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.use_mlock = params.use_mlock; lparams.use_mlock = params.use_mlock;
lparams.logits_all = params.perplexity; lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding; lparams.embedding = params.embedding;
lparams.rope_freq_base = params.rope_freq_base;
lparams.rope_freq_scale = params.rope_freq_scale;
return lparams; return lparams;
} }

View file

@ -28,10 +28,13 @@ struct gpt_params {
int32_t n_ctx = 512; // context size int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
// sampling parameters // sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

View file

@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
return 0; return 0;
} }
if (params.rope_freq_base != 10000.0) {
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 1.0) {
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
}
if (params.n_ctx > 2048) { if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx); " you are on your own\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) { } else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;

View file

@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
auto tokens = ::llama_tokenize(ctx, params.prompt, true); auto tokens = ::llama_tokenize(ctx, params.prompt, true);
int count = 0; const int n_chunk_max = tokens.size() / params.n_ctx;
const int n_chunk = tokens.size() / params.n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0; double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {

View file

@ -12,103 +12,27 @@ struct quant_option {
}; };
static const std::vector<struct quant_option> QUANT_OPTIONS = { static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", },
"Q4_0", { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", },
LLAMA_FTYPE_MOSTLY_Q4_0, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", },
" 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M", { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", },
},
{
"Q4_1",
LLAMA_FTYPE_MOSTLY_Q4_1,
" 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
},
{
"Q5_0",
LLAMA_FTYPE_MOSTLY_Q5_0,
" 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
},
{
"Q5_1",
LLAMA_FTYPE_MOSTLY_Q5_1,
" 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
},
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
{ { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", },
"Q2_K", { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
LLAMA_FTYPE_MOSTLY_Q2_K, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
" 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended", { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
}, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
{ { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
"Q3_K", { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
LLAMA_FTYPE_MOSTLY_Q3_K_M, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
"alias for Q3_K_M" { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
}, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
{ { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
"Q3_K_S", { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", },
LLAMA_FTYPE_MOSTLY_Q3_K_S,
" 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
},
{
"Q3_K_M",
LLAMA_FTYPE_MOSTLY_Q3_K_M,
" 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
},
{
"Q3_K_L",
LLAMA_FTYPE_MOSTLY_Q3_K_L,
" 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
},
{
"Q4_K",
LLAMA_FTYPE_MOSTLY_Q4_K_M,
"alias for Q4_K_M",
},
{
"Q4_K_S",
LLAMA_FTYPE_MOSTLY_Q4_K_S,
" 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
},
{
"Q4_K_M",
LLAMA_FTYPE_MOSTLY_Q4_K_M,
" 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
},
{
"Q5_K",
LLAMA_FTYPE_MOSTLY_Q5_K_M,
"alias for Q5_K_M",
},
{
"Q5_K_S",
LLAMA_FTYPE_MOSTLY_Q5_K_S,
" 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
},
{
"Q5_K_M",
LLAMA_FTYPE_MOSTLY_Q5_K_M,
" 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
},
{
"Q6_K",
LLAMA_FTYPE_MOSTLY_Q6_K,
" 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
},
#endif #endif
{ { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", },
"Q8_0", { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
LLAMA_FTYPE_MOSTLY_Q8_0, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
" 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
},
{
"F16",
LLAMA_FTYPE_MOSTLY_F16,
"13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
},
{
"F32",
LLAMA_FTYPE_ALL_F32,
"26.00G @ 7B - absolutely huge, lossless - not recommended",
},
}; };

View file

@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
```sh ```sh
curl --request POST \ curl --request POST \
--url http://localhost:8080/completion \ --url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
``` ```

View file

@ -32,6 +32,7 @@ tokenize() {
--silent \ --silent \
--request POST \ --request POST \
--url "${API_URL}/tokenize" \ --url "${API_URL}/tokenize" \
--header "Content-Type: application/json" \
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
| jq '.tokens[]' | jq '.tokens[]'
} }
@ -64,6 +65,7 @@ chat_completion() {
--no-buffer \ --no-buffer \
--request POST \ --request POST \
--url "${API_URL}/completion" \ --url "${API_URL}/completion" \
--header "Content-Type: application/json" \
--data-raw "${DATA}") --data-raw "${DATA}")
printf "\n" printf "\n"

View file

@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} }
else if (arg == "--rope-freq-base")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
}
else if (arg == "--rope-freq-scale")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32") else if (arg == "--memory-f32" || arg == "--memory_f32")
{ {
params.memory_f16 = false; params.memory_f16 = false;

View file

@ -36,7 +36,8 @@ struct load_model_inputs
const int debugmode = 0; const int debugmode = 0;
const int forceversion = 0; const int forceversion = 0;
const int gpulayers = 0; const int gpulayers = 0;
const bool linear_rope; const float rope_freq_scale = 1.0f;
const float rope_freq_base = 10000.0f;
const char * banned_tokens[ban_token_max]; const char * banned_tokens[ban_token_max];
}; };
struct generation_inputs struct generation_inputs

View file

@ -2977,10 +2977,8 @@ inline void ggml_cuda_op_rope(
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3]; const int n_ctx = ((int32_t *) src1->data)[3];
const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02); const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
const float p = get_ntk_rope_scale_mode()?p0:(n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx);
bool is_glm = mode & 4; bool is_glm = mode & 4;
@ -3564,6 +3562,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
(void) dst; (void) dst;
} }
// Duplicate src0 into dst by forwarding to ggml_cuda_cpy, with dst passed as the
// second argument of that call. src1 is not used.
void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    (void) src1;

    ggml_cuda_cpy(src0, dst, nullptr);
}
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@ -3697,7 +3700,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
// recursively assign CUDA buffers until a compute tensor is found // recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op; const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
} }
} }
@ -3803,6 +3806,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_DUP:
if (!any_on_device) {
return false;
}
func = ggml_cuda_dup;
break;
case GGML_OP_ADD: case GGML_OP_ADD:
if (!any_on_device) { if (!any_on_device) {
return false; return false;
@ -3857,6 +3866,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
} }
func = ggml_cuda_cpy; func = ggml_cuda_cpy;
break; break;
case GGML_OP_CONT:
if (!any_on_device) {
return false;
}
func = ggml_cuda_dup;
break;
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:

View file

@ -881,28 +881,35 @@ void ggml_metal_graph_compute(
const int n_past = ((int32_t *)(src1->data))[0]; const int n_past = ((int32_t *)(src1->data))[0];
float freq_base;
float freq_scale;
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
[encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setComputePipelineState:ctx->pipeline_rope];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&n_past length:sizeof( int) atIndex:18]; [encoder setBytes:&n_past length:sizeof( int) atIndex:18];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
[encoder setBytes:&mode length:sizeof( int) atIndex:20]; [encoder setBytes:&mode length:sizeof( int) atIndex:20];
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;

View file

@ -656,17 +656,19 @@ kernel void kernel_rope(
constant int & n_past, constant int & n_past,
constant int & n_dims, constant int & n_dims,
constant int & mode, constant int & mode,
constant float & freq_base,
constant float & freq_scale,
uint3 tpig[[thread_position_in_grid]]) { uint3 tpig[[thread_position_in_grid]]) {
const int64_t i3 = tpig[2]; const int64_t i3 = tpig[2];
const int64_t i2 = tpig[1]; const int64_t i2 = tpig[1];
const int64_t i1 = tpig[0]; const int64_t i1 = tpig[0];
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const float theta_scale = pow(10000.0, -2.0f/n_dims); const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
float theta = (float)p; float theta = freq_scale * (float)p;
if (!is_neox) { if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {

112
ggml.c
View file

@ -4292,35 +4292,6 @@ static inline int ggml_up(int n, int m) {
#define ggml_assert_aligned(ptr) \ #define ggml_assert_aligned(ptr) \
GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
// File-local RoPE scaling selector.
//   true  -> NTK-aware scaling (the initial value)
//   false -> linear scaling
static bool useNtkRope = true;

// Store the requested RoPE scaling mode; the value is read back through
// get_ntk_rope_scale_mode().
void set_ntk_rope_scale_mode(bool useNtk) {
    useNtkRope = useNtk;
}

// Return the currently selected RoPE scaling mode
// (true = NTK-aware, false = linear).
bool get_ntk_rope_scale_mode() {
    return useNtkRope;
}
// Compute the RoPE theta_scale factor, base^(-2 / n_dims).
//
//   n_dims - number of rotated dimensions (divisor of the exponent)
//   n_past - accepted for interface compatibility; not used by the computation
//   n_ctx  - context size, selects the NTK base multiplier
//
// The base is 10000 when NTK scaling is disabled or n_ctx <= 2048.
// Otherwise the base is multiplied by a^(n_dims / (n_dims - 2)), with
// a = 4 for n_ctx <= 4096 and a = 8 above that.
float get_theta_scale(int n_dims, int n_past, int n_ctx) {
    (void) n_past;

    // plain RoPE: linear mode selected, or the context fits in 2048 tokens
    if (!get_ntk_rope_scale_mode() || n_ctx <= 2048) {
        return powf(10000.0, -2.0f / n_dims);
    }

    // scaled NTK-aware context
    const float ctx_factor = (n_ctx > 4096) ? 8.0 : 4.0;
    const float base_mult  = powf(ctx_factor, n_dims / (n_dims - 2.0));
    return powf(10000.0 * base_mult, -2.0f / n_dims);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
struct ggml_context * ggml_init(struct ggml_init_params params) { struct ggml_context * ggml_init(struct ggml_init_params params) {
@ -4442,8 +4413,8 @@ void ggml_free(struct ggml_context * ctx) {
if (&g_state.contexts[i].context == ctx) { if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false; g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); __func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) { if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer); GGML_ALIGNED_FREE(ctx->mem_buffer);
@ -6986,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
int n_past, int n_past,
int n_dims, int n_dims,
int mode, int mode,
float freq_base,
float freq_scale,
int n_ctx, int n_ctx,
bool inplace) { bool inplace) {
GGML_ASSERT(n_past >= 0); GGML_ASSERT(n_past >= 0);
@ -6999,12 +6972,14 @@ struct ggml_tensor * ggml_rope_impl(
ggml_scratch_save(ctx); ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode; ((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx; ((int32_t *) b->data)[3] = n_ctx;
memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
ggml_scratch_load(ctx); ggml_scratch_load(ctx);
@ -7023,7 +6998,7 @@ struct ggml_tensor * ggml_rope(
int n_dims, int n_dims,
int mode, int mode,
int n_ctx) { int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false); return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
} }
struct ggml_tensor * ggml_rope_inplace( struct ggml_tensor * ggml_rope_inplace(
@ -7033,7 +7008,19 @@ struct ggml_tensor * ggml_rope_inplace(
int n_dims, int n_dims,
int mode, int mode,
int n_ctx) { int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true); return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
}
struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true);
} }
// ggml_rope_back // ggml_rope_back
@ -12104,16 +12091,21 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 4); GGML_ASSERT(ggml_nelements(src1) == 6);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
float freq_base;
float freq_scale;
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3]; const int n_ctx = ((int32_t *) src1->data)[3];
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
assert(n_past >= 0); assert(n_past >= 0);
@ -12142,7 +12134,7 @@ static void ggml_compute_forward_rope_f32(
// row index used to determine which thread to use // row index used to determine which thread to use
int ir = 0; int ir = 0;
const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
@ -12154,7 +12146,7 @@ static void ggml_compute_forward_rope_f32(
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
float theta = (float)p; float theta = freq_scale * (float)p;
if (is_glm) { if (is_glm) {
theta = MIN(p, n_ctx - 2); theta = MIN(p, n_ctx - 2);
@ -12182,9 +12174,6 @@ static void ggml_compute_forward_rope_f32(
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
} }
} else if (!is_neox) { } else if (!is_neox) {
if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
@ -12234,16 +12223,21 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 4); GGML_ASSERT(ggml_nelements(src1) == 6);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
float freq_base;
float freq_scale;
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3]; const int n_ctx = ((int32_t *) src1->data)[3];
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
assert(n_past >= 0); assert(n_past >= 0);
@ -12272,7 +12266,7 @@ static void ggml_compute_forward_rope_f16(
// row index used to determine which thread to use // row index used to determine which thread to use
int ir = 0; int ir = 0;
const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
@ -12284,7 +12278,7 @@ static void ggml_compute_forward_rope_f16(
if (ir++ < ir0) continue; if (ir++ < ir0) continue;
if (ir > ir1) break; if (ir > ir1) break;
float theta = (float)p; float theta = freq_scale * (float)p;
if (is_glm) { if (is_glm) {
theta = MIN(p, n_ctx - 2); theta = MIN(p, n_ctx - 2);
@ -12312,9 +12306,6 @@ static void ggml_compute_forward_rope_f16(
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
} }
} if (!is_neox) { } if (!is_neox) {
if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
@ -12348,7 +12339,7 @@ static void ggml_compute_forward_rope_f16(
const float x0 = GGML_FP16_TO_FP32(src[0]); const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} }
} }
@ -12400,7 +12391,6 @@ static void ggml_compute_forward_rope_back_f32(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
assert(n_past >= 0); assert(n_past >= 0);
@ -12426,7 +12416,7 @@ static void ggml_compute_forward_rope_back_f32(
// row index used to determine which thread to use // row index used to determine which thread to use
int ir = 0; int ir = 0;
const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
@ -12440,9 +12430,6 @@ static void ggml_compute_forward_rope_back_f32(
float theta = (float)p; float theta = (float)p;
if (!is_neox) { if (!is_neox) {
if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
@ -12503,7 +12490,6 @@ static void ggml_compute_forward_rope_back_f16(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
assert(n_past >= 0); assert(n_past >= 0);
@ -12529,7 +12515,7 @@ static void ggml_compute_forward_rope_back_f16(
// row index used to determine which thread to use // row index used to determine which thread to use
int ir = 0; int ir = 0;
const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx); const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
@ -12543,9 +12529,6 @@ static void ggml_compute_forward_rope_back_f16(
float theta = (float)p; float theta = (float)p;
if (!is_neox) { if (!is_neox) {
if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
theta = theta * GGML_TRAINING_CTX / n_ctx;
}
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
@ -15754,7 +15737,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
// necessary for llama // necessary for llama
if (src0->grad) { if (src0->grad) {
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 4); assert(ggml_nelements(src1) == 6);
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
@ -15775,7 +15758,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
if (src0->grad) { if (src0->grad) {
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 4); assert(ggml_nelements(src1) == 3);
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
@ -16335,8 +16318,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (GGML_OP_HAS_FINALIZE[node->op]) { if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = n_tasks_arr[node_n]; params.nth = n_tasks_arr[node_n];
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} }
ggml_graph_compute_perf_stats_node(node, state->shared);
} }
// distribute new work or execute it direct if 1T // distribute new work or execute it direct if 1T
@ -16366,8 +16349,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (GGML_OP_HAS_FINALIZE[node->op]) { if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE; params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node); ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} }
ggml_graph_compute_perf_stats_node(node, state->shared);
} else { } else {
break; break;
} }
@ -16909,9 +16893,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
} }
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
//assert(cgraph->work == NULL);
//assert(cgraph->work_size == 0);
uint64_t size_eval = 0; uint64_t size_eval = 0;
// compute size of intermediate results // compute size of intermediate results
@ -17350,9 +17331,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT("=== GRAPH ===\n"); GGML_PRINT("=== GRAPH ===\n");
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i]; struct ggml_tensor * node = cgraph->nodes[i];

21
ggml.h
View file

@ -207,12 +207,6 @@
#define GGML_UNUSED(x) (void)(x) #define GGML_UNUSED(x) (void)(x)
// Maximum training context of the model in use.
// For the LLaMA models this is normally 2048. "Stepping out" by 128 (presumably 2048 + 128)
// reportedly gives better results (tested at 7B and 13B), but the default below is the plain 2048.
// The #ifndef guard lets a build override the value by defining GGML_TRAINING_CTX beforehand.
#ifndef GGML_TRAINING_CTX
#define GGML_TRAINING_CTX 2048
#endif
#define GGML_ASSERT(x) \ #define GGML_ASSERT(x) \
do { \ do { \
if (!(x)) { \ if (!(x)) { \
@ -551,10 +545,6 @@ extern "C" {
// use this to compute the memory overhead of a tensor // use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void); GGML_API size_t ggml_tensor_overhead(void);
GGML_API void set_ntk_rope_scale_mode(bool useNtk);
GGML_API bool get_ntk_rope_scale_mode();
GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
// main // main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@ -1130,6 +1120,17 @@ extern "C" {
int mode, int mode,
int n_ctx); int n_ctx);
// custom RoPE, in-place, returns view(a)
// takes the same arguments as ggml_rope_inplace plus two caller-supplied frequency parameters:
//   freq_base  - base of the rotation step: theta_scale = freq_base^(-2/n_dims)
//                (ggml_rope and ggml_rope_inplace pass 10000.0f)
//   freq_scale - multiplier applied to the position: theta = freq_scale * p
//                (ggml_rope and ggml_rope_inplace pass 1.0f)
// NOTE: in the CPU forward pass the GLM mode (mode & 4) overwrites theta with MIN(p, n_ctx - 2),
//       so freq_scale is not applied to that value
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx);
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
// a - dy // a - dy
GGML_API struct ggml_tensor * ggml_rope_back( GGML_API struct ggml_tensor * ggml_rope_back(

View file

@ -348,12 +348,32 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
= gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
= mpt_ctx_v3.hparams.n_ctx = params.n_ctx; = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
//handle linear rope //determine rope scaling params
if(inputs.linear_rope) float rope_freq_scale = 1.0f;
float rope_freq_base = 10000.0f;
if(inputs.rope_freq_scale>0.0f)
{ {
printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n"); rope_freq_scale = inputs.rope_freq_scale;
rope_freq_base = inputs.rope_freq_base;
printf("Using Custom RoPE scaling (scale:%.3f, base:%.1f).\n",rope_freq_scale,rope_freq_base);
} }
set_ntk_rope_scale_mode(!inputs.linear_rope); else
{
rope_freq_scale = 1.0f;
if (params.n_ctx <= 2048) //normie mode
{
rope_freq_base = 10000.0f;
}
else
{
//approximate NTK aware ctx
rope_freq_base = (params.n_ctx <= 4096 ? 40880.0f : 82684.0f);
}
printf("Using automatic RoPE scaling (scale:%.3f, base:%.1f)\n",rope_freq_scale,rope_freq_base);
}
gptj_ctx_v3.hparams.rope_freq_scale = neox_ctx_v3.hparams.rope_freq_scale = rope_freq_scale;
gptj_ctx_v3.hparams.rope_freq_base = neox_ctx_v3.hparams.rope_freq_base = rope_freq_base;
//handle custom token bans //handle custom token bans
banned_tokens.clear(); banned_tokens.clear();
@ -444,6 +464,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
llama_ctx_params.use_mlock = inputs.use_mlock; llama_ctx_params.use_mlock = inputs.use_mlock;
llama_ctx_params.n_gpu_layers = inputs.gpulayers; llama_ctx_params.n_gpu_layers = inputs.gpulayers;
llama_ctx_params.main_gpu = cu_parseinfo_maindevice; llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
llama_ctx_params.rope_freq_base = rope_freq_base;
llama_ctx_params.rope_freq_scale = rope_freq_scale;
llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params); llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);

View file

@ -36,7 +36,8 @@ class load_model_inputs(ctypes.Structure):
("debugmode", ctypes.c_int), ("debugmode", ctypes.c_int),
("forceversion", ctypes.c_int), ("forceversion", ctypes.c_int),
("gpulayers", ctypes.c_int), ("gpulayers", ctypes.c_int),
("linear_rope", ctypes.c_bool), ("rope_freq_scale", ctypes.c_float),
("rope_freq_base", ctypes.c_float),
("banned_tokens", ctypes.c_char_p * ban_token_max)] ("banned_tokens", ctypes.c_char_p * ban_token_max)]
class generation_inputs(ctypes.Structure): class generation_inputs(ctypes.Structure):
@ -189,7 +190,11 @@ def load_model(model_filename):
inputs.blasbatchsize = args.blasbatchsize inputs.blasbatchsize = args.blasbatchsize
inputs.forceversion = args.forceversion inputs.forceversion = args.forceversion
inputs.gpulayers = args.gpulayers inputs.gpulayers = args.gpulayers
inputs.linear_rope = args.linearrope inputs.rope_freq_scale = args.ropeconfig[0]
if len(args.ropeconfig)>1:
inputs.rope_freq_base = args.ropeconfig[1]
else:
inputs.rope_freq_base = 10000
clblastids = 0 clblastids = 0
if args.useclblast: if args.useclblast:
clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1]) clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
@ -1434,7 +1439,7 @@ if __name__ == '__main__':
parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048) parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512) parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
parser.add_argument("--linearrope", help="If set, uses linear RoPE scaling. Otherwise, uses NTK-Aware scaling.", action='store_true') parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true') parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true') parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true') parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')

107
llama.cpp
View file

@ -102,14 +102,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
// memory sizes // memory sizes
// //
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0() static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 256ull * MB }, /* empirical scaling, still a guess */
{ MODEL_7B, 512ull * MB }, { MODEL_3B, ((size_t) n_ctx / 16ull + 180ull) * MB },
{ MODEL_13B, 512ull * MB }, { MODEL_7B, ((size_t) n_ctx / 16ull + 320ull) * MB },
{ MODEL_30B, 640ull * MB }, { MODEL_13B, ((size_t) n_ctx / 12ull + 460ull) * MB },
{ MODEL_65B, 1024ull * MB }, { MODEL_30B, ((size_t) n_ctx / 10ull + 620ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 8ull + 860ull) * MB },
}; };
return k_sizes; return k_sizes;
} }
@ -141,14 +142,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
// this is mostly needed for temporary mul_mat buffers to dequantize the data // this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled // not actually needed if BLAS is disabled
static const std::map<e_model, size_t> & MEM_REQ_EVAL() static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 512ull * MB }, { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
{ MODEL_7B, 800ull * MB }, { MODEL_7B, ((size_t) n_ctx / 256ull + 800ull) * MB },
{ MODEL_13B, 1024ull * MB }, { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
{ MODEL_30B, 1380ull * MB }, { MODEL_30B, ((size_t) n_ctx / 256ull + 1380ull) * MB },
{ MODEL_65B, 1536ull * MB }, { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
}; };
return k_sizes; return k_sizes;
} }
@ -190,6 +191,10 @@ struct llama_hparams {
uint32_t n_head = 32; uint32_t n_head = 32;
uint32_t n_layer = 32; uint32_t n_layer = 32;
uint32_t n_rot = 64; uint32_t n_rot = 64;
float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
bool operator!=(const llama_hparams & other) const { bool operator!=(const llama_hparams & other) const {
@ -648,7 +653,7 @@ struct llama_model_loader {
*ctx_size_p = *mmapped_size_p = 0; *ctx_size_p = *mmapped_size_p = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) { for (const llama_load_tensor & lt : tensors_map.tensors) {
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
} }
} }
@ -844,6 +849,8 @@ struct llama_context_params llama_context_default_params() {
/*.gpu_layers =*/ 0, /*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
/*.tensor_split =*/ {0}, /*.tensor_split =*/ {0},
/*.rope_freq_base =*/ 10000.0f,
/*.rope_freq_scale =*/ 1.0f,
/*.progress_callback =*/ nullptr, /*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr, /*.progress_callback_user_data =*/ nullptr,
/*.low_vram =*/ false, /*.low_vram =*/ false,
@ -967,6 +974,8 @@ static void llama_model_load_internal(
int n_gpu_layers, int n_gpu_layers,
int main_gpu, int main_gpu,
const float * tensor_split, const float * tensor_split,
float rope_freq_base,
float rope_freq_scale,
bool low_vram, bool low_vram,
ggml_type memory_type, ggml_type memory_type,
bool use_mmap, bool use_mmap,
@ -1001,22 +1010,27 @@ static void llama_model_load_internal(
} }
hparams.n_ctx = n_ctx; hparams.n_ctx = n_ctx;
hparams.rope_freq_base = rope_freq_base;
hparams.rope_freq_scale = rope_freq_scale;
} }
const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
{ {
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
} }
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@ -1162,13 +1176,12 @@ static void llama_model_load_internal(
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
// this is the total memory required to run the inference // this is the total memory required to run the inference
const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
const size_t mem_required = const size_t mem_required =
ctx_size + ctx_size +
mmapped_size - vram_weights + // weights in VRAM not in memory mmapped_size - vram_weights + // weights in VRAM not in memory
MEM_REQ_SCRATCH0().at(model.type)*bigctxmul + MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
MEM_REQ_SCRATCH1().at(model.type)*bigctxmul + MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at (model.type); MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
// this is the memory required by one llama_state // this is the memory required by one llama_state
const size_t mem_required_state = const size_t mem_required_state =
@ -1272,6 +1285,8 @@ static bool llama_model_load(
int n_gpu_layers, int n_gpu_layers,
int main_gpu, int main_gpu,
float * tensor_split, float * tensor_split,
float rope_freq_base,
float rope_freq_scale,
bool low_vram, bool low_vram,
ggml_type memory_type, ggml_type memory_type,
bool use_mmap, bool use_mmap,
@ -1280,7 +1295,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback, llama_progress_callback progress_callback,
void *progress_callback_user_data) { void *progress_callback_user_data) {
try { try {
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true; return true;
} catch (const std::exception & err) { } catch (const std::exception & err) {
@ -1332,6 +1347,9 @@ static bool llama_eval_internal(
const int n_rot = hparams.n_embd/hparams.n_head; const int n_rot = hparams.n_embd/hparams.n_head;
const int n_gpu_layers = model.n_gpu_layers; const int n_gpu_layers = model.n_gpu_layers;
const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale;
auto & mem_per_token = lctx.mem_per_token; auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute; auto & buf_compute = lctx.buf_compute;
@ -1429,11 +1447,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq); offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq"); ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
offload_func_kq(Kcur); offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur"); ggml_set_name(Kcur, "Kcur");
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
offload_func_kq(Qcur); offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur"); ggml_set_name(Qcur, "Qcur");
@ -2007,9 +2025,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
} }
// Normalize the second derivatives // Normalize the second derivatives
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); {
for (float & value : second_derivatives) { const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
value /= second_derivatives_sum;
if (second_derivatives_sum > 1e-6f) {
for (float & value : second_derivatives) {
value /= second_derivatives_sum;
}
} else {
for (float & value : second_derivatives) {
value = 1.0f / second_derivatives.size();
}
}
} }
float cum_sum = 0.0f; float cum_sum = 0.0f;
@ -2188,7 +2215,7 @@ void llama_sample_classifier_free_guidance(
struct llama_context * guidance_ctx, struct llama_context * guidance_ctx,
float scale, float scale,
float smooth_factor) { float smooth_factor) {
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); int64_t t_start_sample_us = ggml_time_us();
assert(ctx); assert(ctx);
auto n_vocab = llama_n_vocab(ctx); auto n_vocab = llama_n_vocab(ctx);
@ -2676,8 +2703,9 @@ struct llama_model * llama_load_model_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
params.progress_callback_user_data)) {
delete model; delete model;
fprintf(stderr, "%s: failed to load model\n", __func__); fprintf(stderr, "%s: failed to load model\n", __func__);
return nullptr; return nullptr;
@ -2752,11 +2780,10 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd); ctx->embedding.resize(hparams.n_embd);
} }
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1)); ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
} }
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL

View file

@ -89,6 +89,11 @@ extern "C" {
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors int32_t main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency
float rope_freq_scale; // RoPE frequency scaling factor
// called with a progress value between 0 and 1, pass NULL to disable // called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback; llama_progress_callback progress_callback;
// context pointer passed to the progress callback // context pointer passed to the progress callback

View file

@ -409,6 +409,9 @@ bool gptj_eval(
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_rot; const int n_rot = hparams.n_rot;
const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale;
static size_t buf_size = 256u*1024*1024; static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
@ -475,8 +478,8 @@ bool gptj_eval(
// self-attention // self-attention
{ {
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, n_ctx);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, n_ctx);
// store key and value to memory // store key and value to memory
{ {

View file

@ -425,6 +425,9 @@ bool gpt_neox_eval(
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_rot; const int n_rot = hparams.n_rot;
const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale;
static size_t buf_size = 256u*1024*1024; static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
@ -503,8 +506,8 @@ bool gpt_neox_eval(
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
// using mode = 2 for GPT-NeoX mode // using mode = 2 for GPT-NeoX mode
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, n_ctx); Qcur = ggml_rope_custom_inplace(ctx0, Qcur, n_past, n_rot, 2, freq_base, freq_scale, n_ctx);
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, n_ctx); Kcur = ggml_rope_custom_inplace(ctx0, Kcur, n_past, n_rot, 2, freq_base, freq_scale, n_ctx);
// store key and value to memory // store key and value to memory
{ {

View file

@ -24,6 +24,9 @@ struct gptj_hparams {
int32_t n_layer = 28; int32_t n_layer = 28;
int32_t n_rot = 64; int32_t n_rot = 64;
int32_t ftype = 1; int32_t ftype = 1;
float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
}; };
struct gptj_layer { struct gptj_layer {
@ -309,6 +312,9 @@ struct gpt_neox_hparams {
int32_t n_rot = 32; // rotary_pct * (n_embd / n_head) int32_t n_rot = 32; // rotary_pct * (n_embd / n_head)
int32_t par_res = 1; // 1 = true, 0 = false int32_t par_res = 1; // 1 = true, 0 = false
int32_t ftype = 1; int32_t ftype = 1;
float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
}; };
struct gpt_neox_layer_v2 { struct gpt_neox_layer_v2 {