diff --git a/common/common.cpp b/common/common.cpp
index 73ff0e85b..2217547ab 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2906,51 +2906,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return std::string(result.data(), result.size());
+    return piece;
 }
 
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
-    }
+    text.resize(n_chars);  // NOTE: the original tokenizer decodes bytes after collecting the pieces.
 
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
diff --git a/common/common.h b/common/common.h
index 9a1dc4a2f..55a19c43f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -336,21 +336,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool        special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool  special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
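
For illustration, a minimal caller-side sketch of the unified API introduced above, assuming the common helpers from this branch; the model path is a placeholder and error handling is elided:

    // Hypothetical usage sketch (not part of the patch): tokenize and then
    // detokenize through the single entry point, instead of picking between
    // llama_detokenize_spm() and llama_detokenize_bpe() by tokenizer type.
    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        llama_backend_init();

        // "model.gguf" is a placeholder path
        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
        llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());

        std::vector<llama_token> tokens = llama_tokenize(ctx, "Hello world", /*add_special*/ true);
        std::string text = llama_detokenize(ctx, tokens, /*special*/ false);  // hide control tokens

        printf("%s\n", text.c_str());

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Like llama_token_to_piece(), the underlying llama_detokenize() C call reports the required buffer size as a negative count when the supplied buffer is too small, so the std::string wrapper resizes once and retries rather than growing incrementally.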