diff --git a/README.md b/README.md index 98fdc6808..54bf84bec 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes -- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 +- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 diff --git a/common/common.cpp b/common/common.cpp index 16ef4d7f7..4e317f9c9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1786,17 +1786,17 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); llama_kv_cache_view_cell * c_curr = view.cells; llama_seq_id * cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { if (i % row_size == 0) { printf("\n%5d: ", i); } int seq_count = 0; - for (int j = 0; j < view.n_max_seq; j++) { + for (int j = 0; j < view.n_seq_max; j++) { if (cs_curr[j] >= 0) { seq_count++; } } putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); @@ -1809,14 +1809,14 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; llama_seq_id * cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { - for (int j = 0; j < view.n_max_seq; j++) { + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { + for (int j = 0; j < view.n_seq_max; j++) { if (cs_curr[j] < 0) { continue; } if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } @@ -1835,11 +1835,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { c_curr = view.cells; cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { if (i % row_size == 0) { printf("\n%5d: ", i); } - for (int j = 0; j < view.n_max_seq; j++) { + for (int j = 0; j < view.n_seq_max; j++) { if (cs_curr[j] >= 0) { const auto & it = seqs.find(cs_curr[j]); putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 47059e582..e2d07a631 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -878,6 +878,7 @@ int main(int argc, char ** argv) { const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 293eb52c3..fdfc8f5dc 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -841,7 +841,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; - const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); + const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1118,7 +1118,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int max_tasks_per_batch = 128; - const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); + const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1470,7 +1470,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; - const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); + const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); diff --git a/llama.cpp b/llama.cpp index f61ea791b..44c73df5c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13037,7 +13037,7 @@ uint32_t llama_n_batch(const struct llama_context * ctx) { return ctx->cparams.n_batch; } -uint32_t llama_n_max_seq(const struct llama_context * ctx) { +uint32_t llama_n_seq_max(const struct llama_context * ctx) { return ctx->kv_self.size; } @@ -13201,10 +13201,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, - /*.n_max_seq = */ n_max_seq, + /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, /*.used_cells = */ llama_get_kv_cache_used_cells(ctx), /*.max_contiguous = */ 0, @@ -13232,7 +13232,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); view->cells = (struct llama_kv_cache_view_cell *)p; - p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells); + p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); view->cells_sequences = (llama_seq_id *)p; } @@ -13246,7 +13246,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k uint32_t max_contig = 0; int32_t max_contig_idx = -1; - for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) { + for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) { const size_t curr_size = kv_cells[i].seq_id.size(); token_count += curr_size; c_curr->pos = kv_cells[i].pos + kv_cells[i].delta; @@ -13263,7 +13263,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k int seq_idx = 0; for (const llama_seq_id it : kv_cells[i].seq_id) { - if (seq_idx >= view->n_max_seq) { + if (seq_idx >= view->n_seq_max) { break; } cs_curr[seq_idx] = it; @@ -13272,7 +13272,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k if (seq_idx != 0) { used_cells++; } - for (; seq_idx < view->n_max_seq; seq_idx++) { + for (; seq_idx < view->n_seq_max; seq_idx++) { cs_curr[seq_idx] = -1; } } @@ -13934,12 +13934,12 @@ int32_t llama_tokenize( const char * text, int32_t text_len, llama_token * tokens, - int32_t n_max_tokens, + int32_t n_tokens_max, bool add_bos, bool special) { auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); - if (n_max_tokens < (int) res.size()) { + if (n_tokens_max < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); return -((int) res.size()); } diff --git a/llama.h b/llama.h index ccf65ca4e..55cc92fe7 100644 --- a/llama.h +++ b/llama.h @@ -377,7 +377,7 @@ extern "C" { LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); - LLAMA_API uint32_t llama_n_max_seq (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @@ -456,7 +456,7 @@ extern "C" { // Maximum number of sequences that can exist in a cell. It's not an error // if there are more sequences in a cell than this value, however they will // not be visible in the view cells_sequences. - int32_t n_max_seq; + int32_t n_seq_max; // Number of tokens in the cache. For example, if there are two populated // cells, the first with 1 sequence id in it and the second with 2 sequence @@ -476,12 +476,12 @@ extern "C" { // Information for an individual cell. struct llama_kv_cache_view_cell * cells; - // The sequences for each cell. There will be n_max_seq items per cell. + // The sequences for each cell. There will be n_seq_max items per cell. llama_seq_id * cells_sequences; }; // Create an empty KV cache view. (use only for debugging purposes) - LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq); + LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max); // Free a KV cache view. (use only for debugging purposes) LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); @@ -708,7 +708,7 @@ extern "C" { /// @details Convert the provided text into tokens. /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. - /// @return Returns the number of tokens on success, no more than n_max_tokens + /// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns a negative number on failure - the number of tokens that would have been returned /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. /// Does not insert a leading space. @@ -717,7 +717,7 @@ extern "C" { const char * text, int32_t text_len, llama_token * tokens, - int32_t n_max_tokens, + int32_t n_tokens_max, bool add_bos, bool special);