Reworking rope WIP

2023-07-19 00:54:41 +08:00 · 2023-07-19 00:54:41 +08:00 · 374fffb9c6
commit 374fffb9c6
parent 0a11f50da8 d01bccde9f
24 changed files with 600 additions and 256 deletions
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,8 @@ build/
 build-em/
 build-debug/
 build-release/
 build-ci-debug/
 build-ci-release/
 build-static/
 build-cublas/
 build-opencl/
@ -25,6 +27,10 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
 tmp/
 models/*
 models-mnt
 /main
 /quantize
--- a/ci/README.md
+++ b/ci/README.md
@ -0,0 +1,20 @@
 # CI
 In addition to [GitHub Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
 https://github.com/ggml-org/ci
 It monitors the `master` branch for new commits and runs the
 [ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
 to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
 to cover various hardware architectures, including GPU and Apple Silicon instances.
 Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
 Only the branches of this repo are monitored for this keyword.
 It is good practice to execute the full CI locally on your machine before publishing changes:
 ```bash
 mkdir tmp
 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
--- a/ci/run.sh
+++ b/ci/run.sh
@ -0,0 +1,262 @@
 #!/bin/bash
 # Custom CI entry point.
 #   $1 - output directory for logs, exit codes and the generated README.md
 #   $2 - mount directory used for large, persistent data
 # After this preamble OUT and MNT hold absolute paths, the working directory
 # is the repository root, and SRC holds that root.
 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
 fi
 mkdir -p "$1"
 mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")
 # drop results of a previous run; -f keeps a first run (nothing to delete) quiet
 rm -fv "$OUT"/*.log
 rm -fv "$OUT"/*.exit
 rm -fv "$OUT"/*.md
 # this script lives one level below the repository root
 sd=$(dirname "$0")
 cd "$sd/../" || exit 1
 SRC=$(pwd)
 ## helpers
 # Download a file if it does not exist locally or if the remote copy is newer.
 #   $1 - destination directory (created when missing)
 #   $2 - URL to fetch
 # Returns wget's exit status, or 1 when the destination cannot be entered.
 function gg_wget {
    local out=$1
    local url=$2
    mkdir -p "$out" || return 1
    # a subshell keeps the caller's working directory untouched, even on failure
    (
        cd "$out" || exit 1
        # -N: should not re-download if file is the same
        wget -nv -N "$url"
    )
 }
 # Append printf-formatted text to the CI report at $OUT/README.md.
 #   $1   - printf format string
 #   $2.. - format arguments
 # `--` ends option parsing so formats that begin with '-' (markdown bullets)
 # are accepted; the redirect target is quoted so an output directory whose
 # path contains whitespace does not produce an "ambiguous redirect".
 function gg_printf {
    printf -- "$@" >> "$OUT/README.md"
 }
 # Run one CI step and record its outcome.
 #   $1 - step name; gg_run_<name> and gg_sum_<name> must be defined
 # The step's stdout is streamed to $OUT/<name>.log, its exit status is written
 # to $OUT/<name>.exit, gg_sum_<name> is invoked afterwards, and the status is
 # OR-ed into the global `ret`. `ci` and `cur` are deliberately global: the
 # gg_run_*/gg_sum_* functions read ${ci}.
 function gg_run {
    ci=$1
    # with pipefail the pipeline reports the step's status instead of tee's
    set -o pipefail
    set -x
    "gg_run_$ci" | tee "$OUT/$ci.log"
    cur=$?
    echo "$cur" > "$OUT/$ci.exit"
    set +x
    set +o pipefail
    "gg_sum_$ci"
    ret=$((ret | cur))
 }
 ## ci
 # ctest_debug
 function gg_run_ctest_debug {
    # Configure, build and test a fresh Debug tree in build-ci-debug.
    # Each stage is timed and its combined stdout/stderr is appended to a
    # per-stage log under $OUT. ctest skips test-opt (-E test-opt).
    cd "${SRC}"
    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
    # abort the step at the first failing stage
    set -e
    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                               ) 2>&1 | tee -a "$OUT/${ci}-make.log"
    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    set +e
 }
 function gg_sum_ctest_debug {
    # Append the debug-ctest section to the report: heading, description,
    # recorded exit status, and the ctest log inside a fenced block.
    gg_printf '### %s\n\n' "${ci}"
    gg_printf 'Runs ctest in debug mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-ctest.log")"
    gg_printf '```\n'
    gg_printf '\n'
 }
 # ctest_release
 function gg_run_ctest_release {
    # Configure, build and test a fresh Release tree in build-ci-release.
    # Each stage is timed and logged under $OUT. When GG_BUILD_LOW_PERF is
    # set to a non-empty value, ctest skips test-opt.
    cd "${SRC}"
    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
    # abort the step at the first failing stage
    set -e
    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                               ) 2>&1 | tee -a "$OUT/${ci}-make.log"
    if [ -z "${GG_BUILD_LOW_PERF}" ]; then
        (time ctest --output-on-failure ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    else
        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a "$OUT/${ci}-ctest.log"
    fi
    set +e
 }
 function gg_sum_ctest_release {
    # Append the release-ctest section to the report: heading, description,
    # recorded exit status, and the ctest log inside a fenced block.
    gg_printf '### %s\n\n' "${ci}"
    gg_printf 'Runs ctest in release mode\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat "$OUT/${ci}-ctest.log")"
    gg_printf '```\n'
 }
 # open_llama_3b_v2
 function gg_run_open_llama_3b_v2 {
    # End-to-end check with OpenLLaMA 3B-v2: fetch the model files and a
    # wikitext sample, build a Release tree, convert the model, quantize it,
    # then for every format run a short text generation and a 3-chunk
    # perplexity pass. The step fails when any command fails or when a
    # reported perplexity is missing or above 20.0.
    cd "${SRC}"
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    # only the first 60 lines of the test set are used
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
    # paths below are relative to the build directory entered next
    path_models="../models-mnt/open-llama/3B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
    # from here on any failing command aborts the step
    set -e
    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "$OUT/${ci}-cmake.log"
    (time make -j                                              ) 2>&1 | tee -a "$OUT/${ci}-make.log"
    python3 ../convert.py "${path_models}"
    wiki_test_60="${path_wiki}/wiki.test-60.raw"
    # quantized formats derived from the f16 model; f16 itself is evaluated too
    local quants=(q8_0 q4_0 q4_1 q5_0 q5_1 q3_k q4_k q5_k q6_k)
    local q
    for q in "${quants[@]}"; do
        ./bin/quantize "${path_models}/ggml-model-f16.bin" "${path_models}/ggml-model-${q}.bin" "${q}"
    done
    for q in f16 "${quants[@]}"; do
        (time ./bin/main --model "${path_models}/ggml-model-${q}.bin" -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a "$OUT/${ci}-tg-${q}.log"
    done
    # perplexity output is appended to the same per-format log as the generation
    for q in f16 "${quants[@]}"; do
        (time ./bin/perplexity --model "${path_models}/ggml-model-${q}.bin" -f "${wiki_test_60}" -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a "$OUT/${ci}-tg-${q}.log"
    done
    # check_ppl <format> <text>: take the last decimal number in <text> as the
    # perplexity. Prints one report line; returns 20 when the value is missing
    # or greater than 20.0, 0 otherwise.
    function check_ppl {
        local qnt="$1"
        local ppl
        # `|| true`: a non-matching grep must reach the explicit check below
        # instead of tripping errexit/pipefail without any message
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || true
        if [ -z "$ppl" ]; then
            printf '  - %s @ n/a (FAIL: no perplexity value found)\n' "$qnt"
            return 20
        fi
        if [ "$(echo "$ppl > 20.0" | bc)" -eq 1 ]; then
            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi
        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }
    # the perplexity tool's result lines start with "[1]"
    for q in f16 "${quants[@]}"; do
        check_ppl "${q}" "$(grep "^\[1\]" "$OUT/${ci}-tg-${q}.log")" | tee -a "$OUT/${ci}-ppl.log"
    done
    set +e
 }
 function gg_sum_open_llama_3b_v2 {
    # Append the OpenLLaMA 3B-v2 section to the report: exit status, the
    # perplexity summary, and each format's log inside a fenced block.
    gg_printf '### %s\n\n' "${ci}"
    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat "$OUT/${ci}.exit")"
    gg_printf '- perplexity:\n%s\n' "$(cat "$OUT/${ci}-ppl.log")"
    # f16 keeps its own format string (trailing space after the colon)
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat "$OUT/${ci}-tg-f16.log")"
    local q
    for q in q8_0 q4_0 q4_1 q5_0 q5_1 q3_k q4_k q5_k q6_k; do
        gg_printf '- %s:\n```\n%s\n```\n' "${q}" "$(cat "$OUT/${ci}-tg-${q}.log")"
    done
 }
 ## main
 # Unless GG_BUILD_LOW_PERF is set: point ${SRC}/models-mnt at ${MNT}/models
 # and install the Python requirements.
 if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    rm -rf "${SRC}/models-mnt"
    mnt_models=$(realpath "${MNT}/models")
    mkdir -p "${mnt_models}"
    ln -sfn "${mnt_models}" "${SRC}/models-mnt"
    python3 -m pip install -r "${SRC}/requirements.txt"
 fi
 # accumulated status of all steps; a step only runs while it is still 0
 ret=0
 #test $ret -eq 0 && gg_run ctest_debug
 #test $ret -eq 0 && gg_run ctest_release
 if [ -z "${GG_BUILD_LOW_PERF}" ]; then
    test $ret -eq 0 && gg_run open_llama_3b_v2
 fi
 exit $ret
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_base = std::stof(argv[i]);
        } else if (arg == "--rope-freq-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
        } else if (arg == "--top-p") {
@ -267,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_keep = std::stoi(argv[i]);
        } else if (arg == "--chunks") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_chunks = std::stoi(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
@ -493,6 +511,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
    fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
@ -501,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    if (llama_mlock_supported()) {
        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
@ -573,6 +594,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    lparams.use_mlock    = params.use_mlock;
    lparams.logits_all   = params.perplexity;
    lparams.embedding    = params.embedding;
    lparams.rope_freq_base  = params.rope_freq_base;
    lparams.rope_freq_scale = params.rope_freq_scale;
    return lparams;
 }
--- a/examples/common.h
+++ b/examples/common.h
@ -28,10 +28,13 @@ struct gpt_params {
    int32_t n_ctx                           = 512; // context size
    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
    int32_t n_chunks                        = -1;  // max number of chunks to process (-1 = unlimited)
    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;   // if greater than 0, output the probabilities of top n_probs tokens.
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
        return 0;
    }
    if (params.rope_freq_base != 10000.0) {
        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
    }
    if (params.rope_freq_scale != 1.0) {
        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
    }
    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
+                " you are on your own\n", __func__, params.n_ctx);
    } else if (params.n_ctx < 8) {
        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    // BOS tokens will be added for each chunk before eval
    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-    int count   = 0;
+    const int n_chunk_max = tokens.size() / params.n_ctx;
-    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(ctx);
    const int n_batch = params.n_batch;
    int count = 0;
    double nll = 0.0;
    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
    for (int i = 0; i < n_chunk; ++i) {
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -12,103 +12,27 @@ struct quant_option {
 };
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    {
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
-        "Q4_0",
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
-        LLAMA_FTYPE_MOSTLY_Q4_0,
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
-        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
    },
    {
        "Q4_1",
        LLAMA_FTYPE_MOSTLY_Q4_1,
        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
    },
    {
        "Q5_0",
        LLAMA_FTYPE_MOSTLY_Q5_0,
        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
    },
    {
        "Q5_1",
        LLAMA_FTYPE_MOSTLY_Q5_1,
        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
    },
 #ifdef GGML_USE_K_QUANTS
-    {
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
-        "Q2_K",
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-        LLAMA_FTYPE_MOSTLY_Q2_K,
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
-        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
-    },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
-    {
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-        "Q3_K",
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
-        "alias for Q3_K_M"
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
-    {
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
-        "Q3_K_S",
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
        LLAMA_FTYPE_MOSTLY_Q3_K_S,
        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
    },
    {
        "Q3_K_M",
        LLAMA_FTYPE_MOSTLY_Q3_K_M,
        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
    },
    {
        "Q3_K_L",
        LLAMA_FTYPE_MOSTLY_Q3_K_L,
        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
    },
    {
        "Q4_K",
        LLAMA_FTYPE_MOSTLY_Q4_K_M,
        "alias for Q4_K_M",
    },
    {
        "Q4_K_S",
        LLAMA_FTYPE_MOSTLY_Q4_K_S,
        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
    },
    {
        "Q4_K_M",
        LLAMA_FTYPE_MOSTLY_Q4_K_M,
        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
    },
    {
        "Q5_K",
        LLAMA_FTYPE_MOSTLY_Q5_K_M,
        "alias for Q5_K_M",
    },
    {
        "Q5_K_S",
        LLAMA_FTYPE_MOSTLY_Q5_K_S,
        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
    },
    {
        "Q5_K_M",
        LLAMA_FTYPE_MOSTLY_Q5_K_M,
        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
    },
    {
        "Q6_K",
        LLAMA_FTYPE_MOSTLY_Q6_K,
        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
    },
 #endif
-    {
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
-        "Q8_0",
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
-        LLAMA_FTYPE_MOSTLY_Q8_0,
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
    },
    {
        "F16",
        LLAMA_FTYPE_MOSTLY_F16,
        "13.00G              @ 7B - extremely large, virtually no quality loss - not recommended",
    },
    {
        "F32",
        LLAMA_FTYPE_ALL_F32,
        "26.00G              @ 7B - absolutely huge, lossless - not recommended",
    },
 };
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
 ```sh
 curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
 ```
--- a/examples/server/chat.sh
+++ b/examples/server/chat.sh
@ -32,6 +32,7 @@ tokenize() {
        --silent \
        --request POST \
        --url "${API_URL}/tokenize" \
        --header "Content-Type: application/json" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
    | jq '.tokens[]'
 }
@ -64,6 +65,7 @@ chat_completion() {
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --header "Content-Type: application/json" \
        --data-raw "${DATA}")
    printf "\n"
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
        else if (arg == "--rope-freq-base")
        {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_base = std::stof(argv[i]);
        }
        else if (arg == "--rope-freq-scale")
        {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
        }
        else if (arg == "--memory-f32" || arg == "--memory_f32")
        {
            params.memory_f16 = false;
--- a/expose.h
+++ b/expose.h
@ -36,7 +36,8 @@ struct load_model_inputs
    const int debugmode = 0;
    const int forceversion = 0;
    const int gpulayers = 0;
-    const bool linear_rope;
+    const float rope_freq_scale = 1.0f;
    const float rope_freq_base = 10000.0f;
    const char * banned_tokens[ban_token_max];
 };
 struct generation_inputs
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -2977,10 +2977,8 @@ inline void ggml_cuda_op_rope(
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
+    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
    const float p = get_ntk_rope_scale_mode()?p0:(n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx);
    bool is_glm = mode & 4;
@ -3564,6 +3562,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
    (void) dst;
 }
 // Duplicate op: forwards to ggml_cuda_cpy with src0 as the source and dst as
 // the second tensor argument; the third argument is passed as nullptr.
 void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    // src1 is not used here; the cast silences the unused-parameter warning
    (void) src1;
    ggml_cuda_cpy(src0, dst, nullptr);
 }
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
@ -3697,7 +3700,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
    // recursively assign CUDA buffers until a compute tensor is found
    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
        const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
        }
    }
@ -3803,6 +3806,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
    switch (tensor->op) {
        case GGML_OP_DUP:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
            if (!any_on_device) {
                return false;
@ -3857,6 +3866,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cuda_dup;
            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -881,28 +881,35 @@ void ggml_metal_graph_compute(
                            const int n_past = ((int32_t *)(src1->data))[0];
                            float freq_base;
                            float freq_scale;
                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
                            [encoder setComputePipelineState:ctx->pipeline_rope];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:18];
-                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:20];
                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -656,17 +656,19 @@ kernel void kernel_rope(
        constant       int & n_past,
        constant       int & n_dims,
        constant       int & mode,
        constant     float & freq_base,
        constant     float & freq_scale,
        uint3 tpig[[thread_position_in_grid]]) {
    const int64_t i3 = tpig[2];
    const int64_t i2 = tpig[1];
    const int64_t i1 = tpig[0];
    const bool is_neox = mode & 2;
-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
+    const float theta_scale = pow(freq_base, -2.0f/n_dims);
    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
-    float theta = (float)p;
+    float theta = freq_scale * (float)p;
    if (!is_neox) {
        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
--- a/ggml.c
+++ b/ggml.c
@ -4292,35 +4292,6 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 static bool useNtkRope = true; //uses linear rope if not NTK
 void set_ntk_rope_scale_mode(bool useNtk)
 {
    useNtkRope = useNtk;
 }
 bool get_ntk_rope_scale_mode()
 {
    return useNtkRope;
 }
 float get_theta_scale(int n_dims,int n_past,int n_ctx)
 {
    if (!get_ntk_rope_scale_mode())
    {
        return powf(10000.0, -2.0f / n_dims);
    }
    if (n_ctx <= 2048) //normie mode
    {
        return powf(10000.0, -2.0f / n_dims);
    }
    else
    {
        //using scaled NTK aware ctx
        float a = (n_ctx <= 4096 ? 4.0 : 8.0);
        float m = powf(a, n_dims / (n_dims - 2.0));
        float s = powf(10000.0 * m, -2.0f / n_dims);
        return s;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 struct ggml_context * ggml_init(struct ggml_init_params params) {
@ -4442,8 +4413,8 @@ void ggml_free(struct ggml_context * ctx) {
        if (&g_state.contexts[i].context == ctx) {
            g_state.contexts[i].used = false;
-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+                    __func__, i, ggml_used_mem(ctx));
            if (ctx->mem_buffer_owned) {
                GGML_ALIGNED_FREE(ctx->mem_buffer);
@ -6986,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
        int                   n_past,
        int                   n_dims,
        int                   mode,
        float                 freq_base,
        float                 freq_scale,
        int                   n_ctx,
        bool                  inplace) {
    GGML_ASSERT(n_past >= 0);
@ -6999,12 +6972,14 @@ struct ggml_tensor * ggml_rope_impl(
    ggml_scratch_save(ctx);
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
    ((int32_t *) b->data)[0] = n_past;
    ((int32_t *) b->data)[1] = n_dims;
    ((int32_t *) b->data)[2] = mode;
    ((int32_t *) b->data)[3] = n_ctx;
    memcpy((int32_t *) b->data + 4, &freq_base,  sizeof(float));
    memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
    ggml_scratch_load(ctx);
@ -7023,7 +6998,7 @@ struct ggml_tensor * ggml_rope(
        int                   n_dims,
        int                   mode,
        int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false);
 }
 struct ggml_tensor * ggml_rope_inplace(
@ -7033,7 +7008,19 @@ struct ggml_tensor * ggml_rope_inplace(
        int                   n_dims,
        int                   mode,
        int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true);
 }
 // In-place RoPE with caller-supplied frequency base and scale; forwards to ggml_rope_impl.
 struct ggml_tensor * ggml_rope_custom_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        int                   n_dims,
        int                   mode,
        float                 freq_base,
        float                 freq_scale,
        int                   n_ctx) {
    // this variant always requests inplace = true from the shared implementation
    const bool inplace = true;
    struct ggml_tensor * result = ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, inplace);
    return result;
 }
 // ggml_rope_back
@ -12104,16 +12091,21 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
    float freq_base;
    float freq_scale;
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
    assert(n_past >= 0);
@ -12142,7 +12134,7 @@ static void ggml_compute_forward_rope_f32(
    // row index used to determine which thread to use
    int ir = 0;
-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@ -12154,7 +12146,7 @@ static void ggml_compute_forward_rope_f32(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
-                float theta = (float)p;
+                float theta = freq_scale * (float)p;
                if (is_glm) {
                    theta = MIN(p, n_ctx - 2);
@ -12182,9 +12174,6 @@ static void ggml_compute_forward_rope_f32(
                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
                    }
                } else if (!is_neox) {
                    if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
                        theta = theta * GGML_TRAINING_CTX / n_ctx;
                    }
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta);
                        const float sin_theta = sinf(theta);
@ -12234,16 +12223,21 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 4);
+    GGML_ASSERT(ggml_nelements(src1) == 6);
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
    float freq_base;
    float freq_scale;
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
    assert(n_past >= 0);
@ -12272,7 +12266,7 @@ static void ggml_compute_forward_rope_f16(
    // row index used to determine which thread to use
    int ir = 0;
-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@ -12284,7 +12278,7 @@ static void ggml_compute_forward_rope_f16(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;
-                float theta = (float)p;
+                float theta = freq_scale * (float)p;
                if (is_glm) {
                    theta = MIN(p, n_ctx - 2);
@ -12312,9 +12306,6 @@ static void ggml_compute_forward_rope_f16(
                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                    }
                } if (!is_neox) {
                    if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
                        theta = theta * GGML_TRAINING_CTX / n_ctx;
                    }
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta);
                        const float sin_theta = sinf(theta);
@ -12348,7 +12339,7 @@ static void ggml_compute_forward_rope_f16(
                            const float x0 = GGML_FP16_TO_FP32(src[0]);
                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-                            dst_data[0]     = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                        }
                    }
@ -12400,7 +12391,6 @@ static void ggml_compute_forward_rope_back_f32(
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    assert(n_past >= 0);
@ -12426,7 +12416,7 @@ static void ggml_compute_forward_rope_back_f32(
    // row index used to determine which thread to use
    int ir = 0;
-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const bool is_neox = mode & 2;
@ -12440,9 +12430,6 @@ static void ggml_compute_forward_rope_back_f32(
                float theta = (float)p;
                if (!is_neox) {
                    if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
                        theta = theta * GGML_TRAINING_CTX / n_ctx;
                    }
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta);
                        const float sin_theta = sinf(theta);
@ -12503,7 +12490,6 @@ static void ggml_compute_forward_rope_back_f16(
    const int n_past = ((int32_t *) src1->data)[0];
    const int n_dims = ((int32_t *) src1->data)[1];
    const int mode   = ((int32_t *) src1->data)[2];
    const int n_ctx  = ((int32_t *) src1->data)[3];
    assert(n_past >= 0);
@ -12529,7 +12515,7 @@ static void ggml_compute_forward_rope_back_f16(
    // row index used to determine which thread to use
    int ir = 0;
-    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
    const bool is_neox = mode & 2;
@ -12543,9 +12529,6 @@ static void ggml_compute_forward_rope_back_f16(
                float theta = (float)p;
                if (!is_neox) {
                    if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
                        theta = theta * GGML_TRAINING_CTX / n_ctx;
                    }
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                        const float cos_theta = cosf(theta);
                        const float sin_theta = sinf(theta);
@ -15754,7 +15737,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                // necessary for llama
                if (src0->grad) {
                    assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 4);
+                    assert(ggml_nelements(src1) == 6);
                    const int n_past = ((int32_t *) src1->data)[0];
                    const int n_dims = ((int32_t *) src1->data)[1];
                    const int mode   = ((int32_t *) src1->data)[2];
@ -15775,7 +15758,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                if (src0->grad) {
                    assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 4);
+                    assert(ggml_nelements(src1) == 3);
                    const int n_past = ((int32_t *) src1->data)[0];
                    const int n_dims = ((int32_t *) src1->data)[1];
                    const int mode   = ((int32_t *) src1->data)[2];
@ -16335,8 +16318,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.nth = n_tasks_arr[node_n];
                    ggml_compute_forward(&params, node);
                    ggml_graph_compute_perf_stats_node(node, state->shared);
                }
                ggml_graph_compute_perf_stats_node(node, state->shared);
            }
            // distribute new work or execute it direct if 1T
@ -16366,8 +16349,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                    if (GGML_OP_HAS_FINALIZE[node->op]) {
                        params.type = GGML_TASK_FINALIZE;
                        ggml_compute_forward(&params, node);
                        ggml_graph_compute_perf_stats_node(node, state->shared);
                    }
                    ggml_graph_compute_perf_stats_node(node, state->shared);
                } else {
                    break;
                }
@ -16909,9 +16893,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
 }
 void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
    //assert(cgraph->work      == NULL);
    //assert(cgraph->work_size == 0);
    uint64_t size_eval = 0;
    // compute size of intermediate results
@ -17350,9 +17331,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");
    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
--- a/ggml.h
+++ b/ggml.h
@ -207,12 +207,6 @@
 #define GGML_UNUSED(x) (void)(x)
 // Maximum training context of the model in use
 // For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
 #ifndef GGML_TRAINING_CTX
 #define GGML_TRAINING_CTX 2048
 #endif
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
@ -551,10 +545,6 @@ extern "C" {
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);
    GGML_API void set_ntk_rope_scale_mode(bool useNtk);
    GGML_API bool get_ntk_rope_scale_mode();
    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
    // main
    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@ -1130,6 +1120,17 @@ extern "C" {
            int                   mode,
            int                   n_ctx);
    // custom RoPE, in-place, returns view(a)
    // same as ggml_rope_inplace, but with caller-supplied frequency parameters:
    //   freq_base  - base of the per-dimension rotation step, used as powf(freq_base, -2.0f/n_dims)
    //                (ggml_rope_inplace passes 10000.0f)
    //   freq_scale - multiplier applied to the token position before rotation
    //                (ggml_rope_inplace passes 1.0f)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
            int                   mode,
            float                 freq_base,
            float                 freq_scale,
            int                   n_ctx);
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -348,12 +348,32 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
    = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
-    //handle linear rope
+    //determine rope scaling params
-    if(inputs.linear_rope)
+    float rope_freq_scale = 1.0f;
    float rope_freq_base = 10000.0f;
    if(inputs.rope_freq_scale>0.0f)
    {
-        printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n");
+        rope_freq_scale = inputs.rope_freq_scale;
        rope_freq_base = inputs.rope_freq_base;
        printf("Using Custom RoPE scaling (scale:%.3f, base:%.1f).\n",rope_freq_scale,rope_freq_base);
    }
-    set_ntk_rope_scale_mode(!inputs.linear_rope);
+    else
    {
        rope_freq_scale = 1.0f;
        if (params.n_ctx <= 2048) //normie mode
        {
            rope_freq_base = 10000.0f;
        }
        else
        {
            //approximate NTK aware ctx
            rope_freq_base = (params.n_ctx <= 4096 ? 40880.0f : 82684.0f);
        }
        printf("Using automatic RoPE scaling (scale:%.3f, base:%.1f)\n",rope_freq_scale,rope_freq_base);
    }
    gptj_ctx_v3.hparams.rope_freq_scale = neox_ctx_v3.hparams.rope_freq_scale = rope_freq_scale;
    gptj_ctx_v3.hparams.rope_freq_base = neox_ctx_v3.hparams.rope_freq_base = rope_freq_base;
    //handle custom token bans
    banned_tokens.clear();
@ -444,6 +464,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        llama_ctx_params.use_mlock = inputs.use_mlock;
        llama_ctx_params.n_gpu_layers = inputs.gpulayers;
        llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
        llama_ctx_params.rope_freq_base = rope_freq_base;
        llama_ctx_params.rope_freq_scale = rope_freq_scale;
        llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -36,7 +36,8 @@ class load_model_inputs(ctypes.Structure):
                ("debugmode", ctypes.c_int),
                ("forceversion", ctypes.c_int),
                ("gpulayers", ctypes.c_int),
-                ("linear_rope", ctypes.c_bool),
+                ("rope_freq_scale", ctypes.c_float),
                ("rope_freq_base", ctypes.c_float),
                ("banned_tokens", ctypes.c_char_p * ban_token_max)]
 class generation_inputs(ctypes.Structure):
@ -189,7 +190,11 @@ def load_model(model_filename):
    inputs.blasbatchsize = args.blasbatchsize
    inputs.forceversion = args.forceversion
    inputs.gpulayers = args.gpulayers
-    inputs.linear_rope = args.linearrope
+    inputs.rope_freq_scale = args.ropeconfig[0]
    if len(args.ropeconfig)>1:
        inputs.rope_freq_base = args.ropeconfig[1]
    else:
        inputs.rope_freq_base = 10000
    clblastids = 0
    if args.useclblast:
        clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
@ -1434,7 +1439,7 @@ if __name__ == '__main__':
    parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
    parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
-    parser.add_argument("--linearrope", help="If set, uses linear RoPE scaling. Otherwise, uses NTK-Aware scaling.", action='store_true')
+    parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
    parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
    parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
    parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')
--- a/llama.cpp
+++ b/llama.cpp
@ -102,14 +102,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes
 //
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
+static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
    // NOTE(review): k_sizes is a function-local static, so its initializer runs only on the
    // first call; in the n_ctx-parameterized version, later calls with a different n_ctx
    // still return the sizes computed from the first call's n_ctx.
    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,    256ull * MB },
+        /* empirical scaling, still a guess */
-        { MODEL_7B,    512ull * MB },
+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 180ull) * MB },
-        { MODEL_13B,   512ull * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 320ull) * MB },
-        { MODEL_30B,   640ull * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 460ull) * MB },
-        { MODEL_65B,  1024ull * MB },
+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 620ull) * MB },
        { MODEL_65B,  ((size_t) n_ctx /  8ull + 860ull) * MB },
    };
    return k_sizes;
 }
@ -141,14 +142,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
 {
    // NOTE(review): k_sizes is a function-local static, so its initializer runs only on the
    // first call; in the n_ctx-parameterized version, later calls with a different n_ctx
    // still return the sizes computed from the first call's n_ctx.
    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * MB },
+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
-        { MODEL_7B,   800ull * MB },
+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  800ull) * MB },
-        { MODEL_13B, 1024ull * MB },
+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
-        { MODEL_30B, 1380ull * MB },
+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1380ull) * MB },
-        { MODEL_65B, 1536ull * MB },
+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
    };
    return k_sizes;
 }
@ -190,6 +191,10 @@ struct llama_hparams {
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
    uint32_t n_rot   = 64;
    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;
    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
    bool operator!=(const llama_hparams & other) const {
@ -648,7 +653,7 @@ struct llama_model_loader {
        *ctx_size_p = *mmapped_size_p = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
        }
    }
@ -844,6 +849,8 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
        /*.rope_freq_base              =*/ 10000.0f,
        /*.rope_freq_scale             =*/ 1.0f,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
@ -967,6 +974,8 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
@ -1001,22 +1010,27 @@ static void llama_model_load_internal(
        }
        hparams.n_ctx = n_ctx;
        hparams.rope_freq_base  = rope_freq_base;
        hparams.rope_freq_scale = rope_freq_scale;
    }
    const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
    {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_embd     = %u\n",   __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_mult     = %u\n",   __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot);
        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
+        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
+        fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
    }
    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@ -1162,13 +1176,12 @@ static void llama_model_load_internal(
        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
        // this is the total memory required to run the inference
        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
        const size_t mem_required =
            ctx_size +
            mmapped_size - vram_weights + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type)*bigctxmul +
+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type)*bigctxmul +
+            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at    (model.type);
+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
        // this is the memory required by one llama_state
        const size_t mem_required_state =
@ -1272,6 +1285,8 @@ static bool llama_model_load(
        int n_gpu_layers,
        int main_gpu,
        float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
@ -1280,7 +1295,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
@ -1332,6 +1347,9 @@ static bool llama_eval_internal(
    const int n_rot        = hparams.n_embd/hparams.n_head;
    const int n_gpu_layers = model.n_gpu_layers;
    const float freq_base  = hparams.rope_freq_base;
    const float freq_scale = hparams.rope_freq_scale;
    auto & mem_per_token = lctx.mem_per_token;
    auto & buf_compute   = lctx.buf_compute;
@ -1429,11 +1447,11 @@ static bool llama_eval_internal(
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");
@ -2007,9 +2025,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
    }
    // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+    {
-    for (float & value : second_derivatives) {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-        value /= second_derivatives_sum;
+
        if (second_derivatives_sum > 1e-6f) {
            for (float & value : second_derivatives) {
                value /= second_derivatives_sum;
            }
        } else {
            for (float & value : second_derivatives) {
                value = 1.0f / second_derivatives.size();
            }
        }
    }
    float cum_sum = 0.0f;
@ -2188,7 +2215,7 @@ void llama_sample_classifier_free_guidance(
          struct llama_context * guidance_ctx,
                         float   scale,
                         float   smooth_factor) {
-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
    assert(ctx);
    auto n_vocab = llama_n_vocab(ctx);
@ -2676,8 +2703,9 @@ struct llama_model * llama_load_model_from_file(
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                params.progress_callback_user_data)) {
        delete model;
        fprintf(stderr, "%s: failed to load model\n", __func__);
        return nullptr;
@ -2752,11 +2780,10 @@ struct llama_context * llama_new_context_with_model(
            ctx->embedding.resize(hparams.n_embd);
        }
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
-        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
    }
 #ifdef GGML_USE_METAL
--- a/llama.h
+++ b/llama.h
@ -89,6 +89,11 @@ extern "C" {
        int32_t  n_gpu_layers;                 // number of layers to store in VRAM
        int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;  // RoPE base frequency
        float    rope_freq_scale; // RoPE frequency scaling factor
        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
        // context pointer passed to the progress callback
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@ -409,6 +409,9 @@ bool gptj_eval(
    const int n_vocab = hparams.n_vocab;
    const int n_rot   = hparams.n_rot;
    const float freq_base  = hparams.rope_freq_base;
    const float freq_scale = hparams.rope_freq_scale;
    static size_t buf_size = 256u*1024*1024;
    static void * buf = malloc(buf_size);
@ -475,8 +478,8 @@ bool gptj_eval(
        // self-attention
        {
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, n_ctx);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, n_ctx);
            // store key and value to memory
            {
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@ -425,6 +425,9 @@ bool gpt_neox_eval(
    const int n_vocab = hparams.n_vocab;
    const int n_rot   = hparams.n_rot;
    const float freq_base  = hparams.rope_freq_base;
    const float freq_scale = hparams.rope_freq_scale;
    static size_t buf_size = 256u*1024*1024;
    static void * buf = malloc(buf_size);
@ -503,8 +506,8 @@ bool gpt_neox_eval(
            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
            // using mode = 2 for GPT-NeoX mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, n_ctx);
+            Qcur = ggml_rope_custom_inplace(ctx0, Qcur, n_past, n_rot, 2, freq_base, freq_scale, n_ctx);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, n_ctx);
+            Kcur = ggml_rope_custom_inplace(ctx0, Kcur, n_past, n_rot, 2, freq_base, freq_scale, n_ctx);
            // store key and value to memory
            {
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@ -24,6 +24,9 @@ struct gptj_hparams {
    int32_t n_layer = 28;
    int32_t n_rot   = 64;
    int32_t ftype   = 1;
    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;
 };
 struct gptj_layer {
@ -309,6 +312,9 @@ struct gpt_neox_hparams {
    int32_t n_rot   = 32; // rotary_pct * (n_embd / n_head)
    int32_t par_res = 1; // 1 = true, 0 = false
    int32_t ftype   = 1;
    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;
 };
 struct gpt_neox_layer_v2 {