diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 8c6312508..04e3fc0c1 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -3,6 +3,11 @@ name: Server on: workflow_dispatch: # allows manual triggering + inputs: + slow_tests: + description: 'Run slow tests' + required: true + type: boolean push: branches: - master @@ -11,7 +16,7 @@ on: types: [opened, synchronize, reopened] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*'] schedule: - - cron: '00 0 * * *' + - cron: '0 0 * * *' jobs: server: @@ -80,7 +85,7 @@ jobs: - name: Slow tests id: server_integration_tests_slow - if: github.event.schedule != '' + if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }} run: | cd examples/server/tests PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow diff --git a/README.md b/README.md index 939646753..45c5d06f3 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,10 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +### Recent API changes + +- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 + ### Hot topics - The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761)) diff --git a/common/common.cpp b/common/common.cpp index 1c0b7c403..dbe7e9229 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -335,6 +335,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.yarn_beta_slow = std::stof(argv[i]); + } else if (arg == "--pooling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else { invalid_param = true; break; } } else if (arg == "--defrag-thold" || arg == "-dt") { if (++i >= argc) { invalid_param = true; @@ -1014,6 +1024,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" --pooling {none,mean,cls}\n"); + printf(" pooling type for embeddings, use model default if unspecified\n"); printf(" -dt N, --defrag-thold N\n"); printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); @@ -1296,6 +1308,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.pooling_type = params.pooling_type; cparams.defrag_thold = params.defrag_thold; cparams.offload_kqv = !params.no_kv_offload; diff --git a/common/common.h b/common/common.h index ab62bdb82..d3682b7ad 100644 --- a/common/common.h +++ b/common/common.h @@ -76,8 +76,11 @@ struct gpt_params { float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; - ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + + llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings // // sampling parameters struct llama_sampling_params sparams; diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index fa9d4f22f..ffdba7444 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1644,16 +1644,17 @@ class BertModel(Model): self.gguf_writer.add_causal_attention(False) # get pooling path - with open(self.dir_model / "modules.json", encoding="utf-8") as f: - modules = json.load(f) pooling_path = None - for mod in modules: - if mod["type"] == "sentence_transformers.models.Pooling": - pooling_path = mod["path"] - break + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break # get pooling type - pooling_type = gguf.PoolingType.NONE if pooling_path is not None: with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: pooling = json.load(f) @@ -1663,8 +1664,7 @@ class BertModel(Model): pooling_type = gguf.PoolingType.CLS else: raise NotImplementedError("Only MEAN and CLS pooling types supported") - - self.gguf_writer.add_pooling_type(pooling_type) + self.gguf_writer.add_pooling_type(pooling_type) def set_vocab(self): path = self.dir_model diff --git a/examples/server/README.md b/examples/server/README.md index 9f3a85bce..87ab71068 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -18,7 +18,7 @@ The project is under active development, and we are [looking for feedback and co - `--threads N`, `-t N`: Set the number of threads to use during generation. - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. -- `--threads-http N`: number of threads in the http server pool to process requests (default: `std::thread::hardware_concurrency()`) +- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6009eaaeb..16d60b968 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2027,7 +2027,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" --threads-http N number of threads in the http server pool to process requests (default: hardware concurrency)\n"); + printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n"); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); @@ -3494,10 +3494,12 @@ int main(int argc, char **argv) }*/ //); - if (sparams.n_threads_http > 0) { - log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); - svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + if (sparams.n_threads_http < 1) { + // +2 threads for monitoring endpoints + sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); } + log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); + svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; LOG_INFO("HTTP server listening", log_data); // run the HTTP server in a thread - see comment below diff --git a/flake.lock b/flake.lock index 9f659ba8f..b1b091656 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1706830856, - "narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=", + "lastModified": 1709336216, + "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f", + "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1708655239, - "narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=", + "lastModified": 1709237383, + "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a", + "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", "type": "github" }, "original": { @@ -37,11 +37,11 @@ "nixpkgs-lib": { "locked": { "dir": "lib", - "lastModified": 1706550542, - "narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=", + "lastModified": 1709237383, + "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "97b17f32362e475016f942bbdfda4a4a72a8a652", + "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", "type": "github" }, "original": { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7ed97430f..04c6cb1b8 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6904,6 +6904,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f // find the sum of exps in the block tmp = warp_reduce_sum(tmp); if (block_size > WARP_SIZE) { + __syncthreads(); if (warp_id == 0) { buf_iw[lane_id] = 0.0f; } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5db760cb1..a62139811 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -604,20 +604,28 @@ class PoolingType(IntEnum): class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 + IQ2_XXS = 16 + IQ2_XS = 17 + IQ3_XXS = 18 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 class GGUFEndian(IntEnum): @@ -662,20 +670,28 @@ class GGUFValueType(IntEnum): QK_K = 256 # Items here are (block size, type size) GGML_QUANT_SIZES = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), } diff --git a/llama.cpp b/llama.cpp index d4c7a965b..c1f015791 100644 --- a/llama.cpp +++ b/llama.cpp @@ -873,16 +873,16 @@ struct LLM_TN { // gguf helpers // -static const std::map LLAMA_ROPE_SCALING_TYPES = { +static const std::map LLAMA_ROPE_SCALING_TYPES = { { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; -static int32_t llama_rope_scaling_type_from_string(const std::string & name) { +static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { if (kv.second == name) { - return kv.first; + return (llama_rope_scaling_type) kv.first; } } @@ -1612,7 +1612,6 @@ struct llama_hparams { float rope_freq_base_train; float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; - int32_t rope_scaling_type_train; float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; @@ -1620,8 +1619,9 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; - enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; + enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -1670,8 +1670,8 @@ struct llama_cparams { uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - float rope_freq_base; - float rope_freq_scale; + float rope_freq_base; + float rope_freq_scale; uint32_t n_yarn_orig_ctx; // These hyperparameters are not exposed in GGUF, because all @@ -1683,7 +1683,7 @@ struct llama_cparams { float defrag_thold; bool offload_kqv; - bool do_pooling; + enum llama_pooling_type pooling_type; ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; @@ -2156,10 +2156,12 @@ static bool llama_kv_cache_find_slot( } // find how many cells are currently in use -static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - for (uint32_t i = cache.size - 1; i > 0; --i) { - if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) { - return i + 1; +static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { + for (uint32_t i = cache.size; i > 0; --i) { + const llama_kv_cell & cell = cache.cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; } } @@ -2931,7 +2933,11 @@ template<> bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { uint32_t tmp; const bool found = get_key(kid, tmp, required); - result = (enum llama_pooling_type) tmp; + if (found) { + result = (enum llama_pooling_type) tmp; + } else { + result = LLAMA_POOLING_TYPE_UNSPECIFIED; + } return found; } @@ -3208,7 +3214,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 3: @@ -5173,7 +5179,7 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { @@ -8013,7 +8019,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -8041,7 +8047,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { + if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -8178,7 +8184,7 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); //kv_self.n = llama_kv_cache_cell_max(kv_self); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); @@ -11844,6 +11850,7 @@ struct llama_context_params llama_context_default_params() { /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11859,7 +11866,6 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, - /*.do_pooling =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -12010,7 +12016,7 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.defrag_thold = params.defrag_thold; cparams.offload_kqv = params.offload_kqv; - cparams.do_pooling = params.do_pooling; + cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -12036,6 +12042,14 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } @@ -12615,9 +12629,14 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_logits = ctx->logits.capacity() * sizeof(float); const size_t s_embedding_size = sizeof(size_t); const size_t s_embedding = ctx->embedding.size() * sizeof(float); - const size_t s_kv_size = sizeof(size_t); - const size_t s_kv_ntok = sizeof(int); + const size_t s_kv_buf_size = sizeof(size_t); + const size_t s_kv_head = sizeof(uint32_t); + const size_t s_kv_size = sizeof(uint32_t); + const size_t s_kv_used = sizeof(uint32_t); const size_t s_kv = ctx->kv_self.total_size(); + // TODO: assume the max is more than 1 seq_id per KV cell + const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id); + const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell; const size_t s_total = ( + s_rng_size @@ -12626,9 +12645,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) { + s_logits + s_embedding_size + s_embedding + + s_kv_buf_size + + s_kv_head + s_kv_size - + s_kv_ntok + + s_kv_used + s_kv + + s_kv_cells ); return s_total; @@ -12728,15 +12750,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; - const auto & cparams = ctx->cparams; const uint32_t n_layer = hparams.n_layer; const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_ctx = cparams.n_ctx; const size_t kv_buf_size = kv_self.total_size(); - const uint32_t kv_head = kv_self.head; + const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -12756,7 +12776,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); tmp_buf.resize(v_row_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { @@ -12766,7 +12786,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat } } - for (uint32_t i = 0; i < kv_size; ++i) { + for (uint32_t i = 0; i < kv_head; ++i) { const auto & cell = kv_self.cells[i]; const llama_pos pos = cell.pos; @@ -12842,12 +12862,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { { const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; - const auto & cparams = ctx->cparams; const uint32_t n_layer = hparams.n_layer; const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_ctx = cparams.n_ctx; size_t kv_buf_size; uint32_t kv_head; @@ -12870,7 +12888,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); @@ -12879,13 +12897,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { } } + GGML_ASSERT(kv_self.size == kv_size); + ctx->kv_self.head = kv_head; ctx->kv_self.size = kv_size; ctx->kv_self.used = kv_used; ctx->kv_self.cells.resize(kv_size); - for (uint32_t i = 0; i < kv_size; ++i) { + for (uint32_t i = 0; i < kv_head; ++i) { llama_pos pos; size_t seq_id_size; @@ -12901,6 +12921,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ctx->kv_self.cells[i].seq_id.insert(seq_id); } } + + for (uint32_t i = kv_head; i < kv_size; ++i) { + ctx->kv_self.cells[i].pos = -1; + ctx->kv_self.cells[i].seq_id.clear(); + } } const size_t nread = inp - src; diff --git a/llama.h b/llama.h index 6406b5270..70da4cb3f 100644 --- a/llama.h +++ b/llama.h @@ -129,6 +129,7 @@ extern "C" { }; enum llama_pooling_type { + LLAMA_POOLING_TYPE_UNSPECIFIED = -1, LLAMA_POOLING_TYPE_NONE = 0, LLAMA_POOLING_TYPE_MEAN = 1, LLAMA_POOLING_TYPE_CLS = 2, @@ -236,7 +237,10 @@ extern "C" { uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing - int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` + + enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` + enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id + // (ignored if no pooling layer) // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model @@ -258,7 +262,6 @@ extern "C" { bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) // Abort callback // if it returns true, execution of llama_decode() will be aborted