From cc1fd2fd0d1c99760b393b4bd233b091ee41f0c6 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 10 Feb 2025 09:47:18 +0100
Subject: [PATCH] llama : add remove_space_prefix to llama_detokenize

This commit adds a new parameter to llama_detokenize that controls
whether the leading space before tokens with a word boundary character
is removed.

The motivation for this change is that when llama_server returns
completion_probabilities, the tokens are detokenized, and currently the
leading space for boundary tokens is removed. With this change,
llama_server can set remove_space_prefix to false so that the leading
space is preserved.

Resolves: https://github.com/ggerganov/llama.cpp/issues/11728
---
 common/common.cpp          | 10 +++++-----
 common/common.h            |  6 ++++--
 common/llguidance.cpp      |  4 ++--
 examples/server/server.cpp |  2 +-
 include/llama.h            |  4 +++-
 src/llama-vocab.cpp        | 27 ++++++++++++++++-----------
 src/llama-vocab.h          |  6 ++++--
 7 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8661e164a..cfcbc3ce7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1746,19 +1746,19 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token
     return piece;
 }
 
-std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    return common_detokenize(vocab, tokens, special);
+    return common_detokenize(vocab, tokens, special, remove_space_prefix);
 }
 
-std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
         GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
diff --git a/common/common.h b/common/common.h
index b208d0c7e..5bd18354c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -601,12 +601,14 @@ std::string common_token_to_piece(
 std::string common_detokenize(
         const struct llama_context * ctx,
         const std::vector<llama_token> & tokens,
-        bool special = true);
+        bool special = true,
+        bool remove_space_prefix = true);
 
 std::string common_detokenize(
         const struct llama_vocab * vocab,
         const std::vector<llama_token> & tokens,
-        bool special = true);
+        bool special = true,
+        bool remove_space_prefix = true);
 
 //
 // Chat template utils
diff --git a/common/llguidance.cpp b/common/llguidance.cpp
index 2feeb93c8..925b55553 100644
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -176,12 +176,12 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
         llama_token token = i;
         auto dp = (char *) token_bytes + offset;
-        auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
+        auto size = llama_detokenize(vocab, &token, 1, dp, max_token, false, false, true);
         if (size < 0) {
             GGML_ABORT("llama_detokenize failed\n");
         }
 
         if (size == 0) {
-            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
+            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true, true);
             if (size < 0) {
                 GGML_ABORT("llama_detokenize failed\n");
             }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0718806c8..cc6a6f049 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2297,7 +2297,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                 result.probs.push_back({
                     cur[i].id,
-                    common_detokenize(ctx, {cur[i].id}, special),
+                    common_detokenize(ctx, {cur[i].id}, special, /* remove_space_prefix */ false),
                     cur[i].p
                 });
             }
diff --git a/include/llama.h b/include/llama.h
index 3784f7d39..e524c4eb1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1025,6 +1025,7 @@ extern "C" {
     /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
     /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
    /// @param unparse_special If true, special tokens are rendered in the output.
+    /// @param remove_space_prefix If true, removes the leading space before tokens that have a word boundary character.
     LLAMA_API int32_t llama_detokenize(
         const struct llama_vocab * vocab,
         const llama_token * tokens,
@@ -1032,7 +1033,8 @@ extern "C" {
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix);
 
     //
     // Chat templates
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index ad9ffe66a..bde514273 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1322,11 +1322,13 @@ struct llama_vocab::impl {
             char * text,
             int32_t text_len_max,
             bool remove_special,
-            bool unparse_special) const;
+            bool unparse_special,
+            bool remove_space_prefix = true) const;
 
     std::string detokenize(
             const std::vector<llama_token> & tokens,
-            bool special) const;
+            bool special,
+            bool remove_space_prefix = true) const;
 
     void print_info() const;
 
@@ -2581,7 +2583,8 @@ int32_t llama_vocab::impl::detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) const {
+        bool unparse_special,
+        bool remove_space_prefix) const {
     if (type == LLAMA_VOCAB_TYPE_NONE) {
         return 0;
     }
@@ -2592,7 +2595,7 @@ int32_t llama_vocab::impl::detokenize(
     int32_t total = 0;
 
     // remove the leading space
-    bool remove_space = add_space_prefix;
+    bool remove_space = add_space_prefix && remove_space_prefix;
 
     if (remove_special && add_bos) {
         if (n_tokens > 0 && tokens[0] == special_bos_id) {
@@ -2991,17 +2994,18 @@ int32_t llama_vocab::detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) const {
-    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix) const {
+    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
 }
 
-std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special, bool remove_space_prefix) const {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special, remove_space_prefix);
         GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
@@ -3246,7 +3250,8 @@ int32_t llama_detokenize(
         char * text,
         int32_t text_len_max,
         bool remove_special,
-        bool unparse_special) {
-    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+        bool unparse_special,
+        bool remove_space_prefix) {
+    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special, remove_space_prefix);
 }
 
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 5ce355214..028284070 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -111,11 +111,13 @@ struct llama_vocab {
             char * text,
             int32_t text_len_max,
             bool remove_special,
-            bool unparse_special) const;
+            bool unparse_special,
+            bool remove_space_prefix = true) const;
 
     std::string detokenize(
             const std::vector<llama_token> & tokens,
-            bool special) const;
+            bool special,
+            bool remove_space_prefix = true) const;
 
     void print_info() const;
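
Note for reviewers: a minimal sketch of how a caller exercises the new
parameter once the patch is applied. It is illustrative only: the helper
name and the token value are placeholders, and it assumes an already
initialized llama_context * ctx whose model tokenizer sets
add_space_prefix (e.g. an SPM-style vocab); otherwise the flag has no
effect.

    #include <cstdio>
    #include <string>

    #include "common.h"

    // Hypothetical helper: detokenize a single word-boundary token with and
    // without the new flag to show the difference.
    static void compare_space_prefix(llama_context * ctx, llama_token boundary_tok) {
        // Default (remove_space_prefix = true): old behavior, the leading
        // space added by the tokenizer's space prefix is trimmed.
        std::string trimmed = common_detokenize(ctx, { boundary_tok }, /* special */ true);

        // remove_space_prefix = false: the leading space is kept, so the
        // per-token pieces returned in completion_probabilities concatenate
        // back to the original completion text.
        std::string kept = common_detokenize(ctx, { boundary_tok }, /* special */ true,
                                             /* remove_space_prefix */ false);

        printf("trimmed: '%s'\n", trimmed.c_str());
        printf("kept:    '%s'\n", kept.c_str());
    }

This mirrors the server.cpp change above, which passes
/* remove_space_prefix */ false when detokenizing individual tokens for
completion_probabilities while leaving all other call sites on the
default behavior.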