diff --git a/common/common.cpp b/common/common.cpp
index 541baf268..2f7b77b27 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2924,10 +2924,10 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }

diff --git a/llama.cpp b/llama.cpp
index af310ee2e..039158370 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -18503,16 +18503,30 @@ int32_t llama_detokenize(
                          int32_t   n_tokens,
                             char * text,
                          int32_t   text_len_max,
-                            bool   special) {
+                            bool   remove_special,
+                            bool   unparse_special) {
     // remove the leading space of the first non-control token
     static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
-    bool remove_space = !special && model->vocab.tokenizer_add_space_prefix;
+    bool remove_space = !unparse_special && model->vocab.tokenizer_add_space_prefix;
     int32_t avail = text_len_max;
     int32_t total = 0;

+    if (remove_special && model->vocab.tokenizer_add_bos) {
+        if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
+            n_tokens--;
+            tokens++;
+        }
+    }
+
+    if (remove_special && model->vocab.tokenizer_add_eos) {
+        if (n_tokens > 0 && tokens[n_tokens-1] == model->vocab.special_eos_id) {
+            n_tokens--;
+        }
+    }
+
     for (int32_t i = 0; i < n_tokens; ++i) {
         GGML_ASSERT(avail >= 0);
-        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, special);
+        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
         const llama_token_attr attr = llama_token_get_attr(model, tokens[i]);
         remove_space = remove_space && (attr & attr_special); // until non-control token
         if (n_chars < 0) {
diff --git a/llama.h b/llama.h
index 72bd555fd..62a78fcb5 100644
--- a/llama.h
+++ b/llama.h
@@ -874,6 +874,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
@@ -898,18 +899,20 @@ extern "C" {
                         int32_t   lstrip,
                            bool   special);

-    /// @details Convert the provided tokens into text.
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
     /// @param text The char pointer must be large enough to hold the resulting text.
     /// @return Returns the number of chars/bytes on success, no more than text_len_max.
     /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
-    /// @param special If true, special tokens are rendered in the output.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_detokenize(
         const struct llama_model * model,
                const llama_token * tokens,
                          int32_t   n_tokens,
                             char * text,
                          int32_t   text_len_max,
-                            bool   special);
+                            bool   remove_special,
+                            bool   unparse_special);

     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 15ca5bf31..5d4f6e876 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -98,15 +98,15 @@ class LibLlamaModel:
         num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, len(self.token_ids), add_special, parse_special)
         return list(self.token_ids[0:num])

-    def detokenize(self, ids: list[int], special: bool = False) -> str:
+    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
         if len(self.token_ids) < len(ids):
             self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
         for i, id in enumerate(ids):
             self.token_ids[i] = id
-        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), special)
+        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         while num < 0 and len(self.text_buff) < (16 << 20):
             self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
-            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), special)
+            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'


@@ -160,7 +160,7 @@ class TokenizerLlamaCpp (Tokenizer):
         return self.model.tokenize(text, add_special=True, parse_special=True)

     def decode(self, ids: list[int]) -> str:
-        return self.model.detokenize(ids, special=True)
+        return self.model.detokenize(ids, remove_special=False, unparse_special=True)


 def generator_custom_text() -> Iterator[str]:
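Not part of the patch above: a minimal caller-side sketch of the updated llama_detokenize() signature. It assumes an already-loaded llama_model and a std::vector<llama_token> of ids (the helper name detokenize_example is illustrative), and mirrors the two-pass resize-and-retry pattern of the common.cpp wrapper, here passing remove_special=true and unparse_special=false to strip BOS/EOS while keeping other special tokens hidden.

// Illustrative usage sketch, not taken from the diff.
#include <algorithm>
#include <string>
#include <vector>

#include "llama.h"

static std::string detokenize_example(const llama_model * model, const std::vector<llama_token> & tokens) {
    std::string text;
    text.resize(std::max(text.capacity(), tokens.size()));
    // first pass: on failure the return value is the negated number of chars/bytes needed
    int32_t n_chars = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                                       &text[0], (int32_t) text.size(),
                                       /*remove_special=*/ true, /*unparse_special=*/ false);
    if (n_chars < 0) {
        text.resize(-n_chars);  // grow the buffer to the required size and retry
        n_chars = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                                   &text[0], (int32_t) text.size(),
                                   /*remove_special=*/ true, /*unparse_special=*/ false);
    }
    text.resize(n_chars);  // shrink to the actual number of chars/bytes written
    return text;
}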