From d5a410e8556191672465f7ff58682ea2474038b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 7 Jan 2024 17:24:08 +0100 Subject: [PATCH 1/4] CUDA: fixed redundant value dequantization (#4809) --- ggml-cuda.cu | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 54b266be4..2df64b111 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1872,14 +1872,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const float * x = (const float *) vx; - - // automatic half -> float type cast if dfloat == float - v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; -} - static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { const int ix = blockDim.x*blockIdx.x + threadIdx.x; @@ -1983,7 +1975,7 @@ static __global__ void k_get_rows_float( template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { - const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; + const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (i >= k) { return; @@ -2002,6 +1994,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ y[iybs + iqs + y_offset] = v.y; } +template +static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + const src_t * x = (src_t *) vx; + + y[i] = x[i]; +} + // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q @@ -5609,7 +5614,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con template static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); dequantize_block<<>>(vx, y, k); } @@ -5659,6 +5664,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu #endif } +template +static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + convert_unary<<>>(vx, y, k); +} + static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -5682,7 +5693,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_Q6_K: return dequantize_row_q6_K_cuda; case GGML_TYPE_F32: - return dequantize_block_cuda<1, 1, convert_f32>; + return convert_unary_cuda; default: return nullptr; } @@ -5711,7 +5722,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_Q6_K: return dequantize_row_q6_K_cuda; case GGML_TYPE_F16: - return dequantize_block_cuda<1, 1, convert_f16>; + return convert_unary_cuda; default: return nullptr; } From 226460cc0d5b185bc6685fb76f418fd9418d7add Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 7 Jan 2024 17:59:01 +0100 Subject: [PATCH 2/4] llama-bench : add no-kv-offload parameter (#4812) --- examples/llama-bench/llama-bench.cpp | 34 +++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 6617c050d..7f7186cde 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -138,6 +138,7 @@ struct cmd_params { std::vector n_threads; std::vector n_gpu_layers; std::vector main_gpu; + std::vector no_kv_offload; std::vector mul_mat_q; std::vector> tensor_split; int reps; @@ -155,6 +156,7 @@ static const cmd_params cmd_params_defaults = { /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, /* main_gpu */ {0}, + /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, /* tensor_split */ {{}}, /* reps */ 5, @@ -176,6 +178,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -ts, --tensor_split \n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -309,6 +312,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.main_gpu = split(argv[i], split_delim); + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); } else if (arg == "-mmq" || arg == "--mul-mat-q") { if (++i >= argc) { invalid_param = true; @@ -383,6 +393,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } + if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -400,6 +411,7 @@ struct cmd_params_instance { int n_threads; int n_gpu_layers; int main_gpu; + bool no_kv_offload; bool mul_mat_q; std::array tensor_split; @@ -428,6 +440,7 @@ struct cmd_params_instance { cparams.type_k = type_k; cparams.type_v = type_v; cparams.mul_mat_q = mul_mat_q; + cparams.offload_kqv = !no_kv_offload; return cparams; } @@ -444,6 +457,7 @@ static std::vector get_cmd_params_instances_int(const cmd_p for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) + for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { cmd_params_instance instance = { /* .model = */ m, @@ -455,6 +469,7 @@ static std::vector get_cmd_params_instances_int(const cmd_p /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, }; @@ -476,6 +491,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) for (const auto & mmq : params.mul_mat_q) + for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -491,6 +507,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, }; @@ -511,6 +528,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, }; @@ -559,6 +577,7 @@ struct test { ggml_type type_v; int n_gpu_layers; int main_gpu; + bool no_kv_offload; bool mul_mat_q; std::array tensor_split; int n_prompt; @@ -579,6 +598,7 @@ struct test { type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; mul_mat_q = inst.mul_mat_q; tensor_split = inst.tensor_split; n_prompt = inst.n_prompt; @@ -640,7 +660,8 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", - "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", + "n_gpu_layers", "main_gpu", "no_kv_offload", + "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -659,7 +680,7 @@ struct test { return INT; } if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "mul_mat_q") { + field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -690,7 +711,8 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, + std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload), + std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -851,6 +873,9 @@ struct markdown_printer : public printer { if (field == "mul_mat_q") { return "mmq"; } + if (field == "no_kv_offload") { + return "nkvo"; + } if (field == "tensor_split") { return "ts"; } @@ -885,6 +910,9 @@ struct markdown_printer : public printer { if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { fields.push_back("mul_mat_q"); } + if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { + fields.push_back("no_kv_offload"); + } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { fields.push_back("tensor_split"); } From b7e7982953f80a656e03feb5cfb17a17a173eb26 Mon Sep 17 00:00:00 2001 From: Lars Grammel Date: Sun, 7 Jan 2024 21:24:11 +0100 Subject: [PATCH 3/4] readme : add lgrammel/modelfusion JS/TS client for llama.cpp (#4814) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ca6d14e17..2f6e6ffee 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ as the main playground for developing new features for the [ggml](https://github - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) +- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) From b0034d93ce2949ce7d9c098ca02e56f66cd484e2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 Jan 2024 11:14:04 +0200 Subject: [PATCH 4/4] examples : add passkey test (#3856) * examples : add passkey test * passkey : better prints * passkey : select pass key pos from CLI * passkey : simplify n_past logic * make : add passkey target * passkey : add "self-extend"-like context extension (#4810) * llama : "self-extend"-like context extension * passkey : add comment * passkey : add readme --- .gitignore | 1 + Makefile | 5 +- examples/CMakeLists.txt | 1 + examples/batched/batched.cpp | 1 + examples/passkey/CMakeLists.txt | 5 + examples/passkey/README.md | 12 ++ examples/passkey/passkey.cpp | 296 ++++++++++++++++++++++++++++++++ llama.cpp | 34 ++++ llama.h | 7 + 9 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 examples/passkey/CMakeLists.txt create mode 100644 examples/passkey/README.md create mode 100644 examples/passkey/passkey.cpp diff --git a/.gitignore b/.gitignore index def74a1e9..cf1b692e9 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,7 @@ models-mnt /lookup /main /metal +/passkey /perplexity /q8dot /quantize diff --git a/Makefile b/Makefile index 28c6d79bc..4c7e175bf 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ BUILD_TARGETS = \ main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o + speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -665,6 +665,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + ifdef LLAMA_METAL metal: examples/metal/metal.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4cc13d6e9..0c71cbdf7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -31,6 +31,7 @@ else() add_subdirectory(quantize-stats) add_subdirectory(save-load-state) add_subdirectory(simple) + add_subdirectory(passkey) add_subdirectory(speculative) add_subdirectory(lookahead) add_subdirectory(lookup) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 22a4265df..b1775e0b0 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -69,6 +69,7 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(model, params.prompt, true); + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; // initialize the context diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt new file mode 100644 index 000000000..3161bf3ef --- /dev/null +++ b/examples/passkey/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET passkey) +add_executable(${TARGET} passkey.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/passkey/README.md b/examples/passkey/README.md new file mode 100644 index 000000000..4a22bb559 --- /dev/null +++ b/examples/passkey/README.md @@ -0,0 +1,12 @@ +# llama.cpp/example/passkey + +See the following PRs for more info: + +- https://github.com/ggerganov/llama.cpp/pull/3856 +- https://github.com/ggerganov/llama.cpp/pull/4810 + +### Usage + +```bash +make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 +``` diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp new file mode 100644 index 000000000..5c0022832 --- /dev/null +++ b/examples/passkey/passkey.cpp @@ -0,0 +1,296 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + + if (argc == 1 || argv[1][0] == '-') { + printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]); + return 1 ; + } + + int seed = -1; + + int n_junk = 250; // number of times to repeat the junk text + int n_keep = 32; // number of tokens in the prompt prefix + int n_grp = 1; // if more than 1 - perform LongLM SelfExtend + int i_pos = -1; // position of the passkey in the junk text + + if (argc >= 2) { + params.model = argv[1]; + } + + if (argc >= 3) { + n_junk = std::stoi(argv[2]); + } + + if (argc >= 4) { + n_grp = std::stoi(argv[3]); + } + + if (argc >= 5) { + i_pos = std::stoi(argv[4]); + } + + if (argc >= 6) { + seed = std::stoi(argv[5]); + } + + if (seed == -1) { + seed = time(NULL); + } + + srand(seed); + + if (i_pos == -1) { + i_pos = rand() % n_junk; + } + + const std::string prompt_prefix = "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there."; + const std::string prompt_suffix = " What is the pass key? The pass key is"; + + // generate junk text + params.prompt = prompt_prefix; + + const int passkey = rand() % 50000 + 1; + + for (int i = 0; i < n_junk; i++) { + if (i % n_junk == i_pos) { + params.prompt += " The pass key is " + std::to_string(passkey) + ". Remember it. " + std::to_string(passkey) + " is the pass key."; + } + + params.prompt += " The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."; + } + + params.prompt += prompt_suffix; + + // init LLM + + llama_backend_init(params.numa); + + // initialize the model + + llama_model_params model_params = llama_model_default_params(); + + model_params.n_gpu_layers = 99; // offload all layers to the GPU + + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + // initialize the context + + llama_context_params ctx_params = llama_context_default_params(); + + ctx_params.seed = seed; + ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; + ctx_params.n_batch = 512; + ctx_params.n_threads = params.n_threads; + ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + + GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); + + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + // tokenize the prompt + std::vector tokens_list; + tokens_list = ::llama_tokenize(ctx, params.prompt, true); + + // tokenize the prefix and use it as a sink + const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size(); + + const int n_tokens_all = tokens_list.size(); + + // we leave a margin of 16 tokens for the generated text - it should contain just the passkey + const int n_predict = 16; + + // total length of the sequences including the prompt + const int n_len = n_tokens_all + n_predict; + + const int n_ctx = llama_n_ctx(ctx) - n_keep; + const int n_kv_req = llama_n_ctx(ctx); + const int n_batch = ctx_params.n_batch; + const int n_batch_grp = ctx_params.n_batch/n_grp; + + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch); + + // print the prompt token-by-token + + LOG_TEE("\n"); + LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); + LOG_TEE("prompt tokens: %d\n", n_tokens_all); + //LOG_TEE("prompt: %s\n", params.prompt.c_str()); + + llama_batch batch = llama_batch_init(512, 0, 1); + + int n_past = 0; + + // fill the KV cache + for (int i = 0; i < n_ctx; i += n_batch) { + if (i > 0 && n_grp > 1) { + // if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp + const int ib = i/n_batch - 1; + const int bd = n_batch_grp*(n_grp - 1); + + llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + + n_past -= bd; + } + + llama_batch_clear(batch); + + for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { + llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + } + + if (i + n_batch >= n_tokens_all) { + batch.logits[batch.n_tokens - 1] = true; + } + + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + + if (i + n_batch >= n_tokens_all) { + break; + } + } + + for (int i = n_ctx; i < n_tokens_all; i += n_batch) { + const int n_discard = n_batch; + + LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + + n_past -= n_discard; + + llama_batch_clear(batch); + + for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { + llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + } + + if (i + n_batch >= n_tokens_all) { + batch.logits[batch.n_tokens - 1] = true; + } + + if (llama_decode(ctx, batch) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + } + + { + const int n_discard = n_past - n_ctx + n_predict; + + if (n_discard > 0) { + LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + + n_past -= n_discard; + } + } + + LOG_TEE("\n"); + LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); + LOG_TEE("\n"); + + // main loop + + int n_cur = n_tokens_all; + int n_decode = 0; + + LOG_TEE("%s", prompt_suffix.c_str()); + fflush(stdout); + + const auto t_main_start = ggml_time_us(); + + while (n_cur <= n_len) { + // sample the next token + { + auto n_vocab = llama_n_vocab(model); + auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // sample the most likely token + const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + // is it an end of stream? + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + LOG_TEE("\n"); + + break; + } + + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); + + n_decode += 1; + + // prepare the next batch + llama_batch_clear(batch); + + // push this new token for next evaluation + llama_batch_add(batch, new_token_id, n_past++, { 0 }, true); + } + + n_cur += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + } + + LOG_TEE("\n"); + + const auto t_main_end = ggml_time_us(); + + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); + + llama_print_timings(ctx); + + fprintf(stderr, "\n"); + + llama_batch_free(batch); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp b/llama.cpp index 91aa3f8e7..63853d1c3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1903,6 +1903,28 @@ static void llama_kv_cache_seq_shift( cache.head = new_head != cache.size ? new_head : 0; } +static void llama_kv_cache_seq_div( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.has_shift = true; + + { + llama_pos p_old = cache.cells[i].pos; + cache.cells[i].pos /= d; + cache.cells[i].delta += cache.cells[i].pos - p_old; + } + } + } +} + // // model loading and saving // @@ -10140,9 +10162,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { } void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); } +void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); +} + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. diff --git a/llama.h b/llama.h index 461d4604a..5305de90b 100644 --- a/llama.h +++ b/llama.h @@ -484,6 +484,13 @@ extern "C" { llama_pos p1, llama_pos delta); + LLAMA_API void llama_kv_cache_seq_div( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + // // State / sessions //