From 37246b1031b1680c0dcaf20aef736d6b446203fa Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Wed, 24 Apr 2024 05:15:29 -0500
Subject: [PATCH] common : revert showing control tokens by default for server
 (#6860)

* fix: revert showing control tokens by default

* feat: revert changes to default behavior of llama_token_to_piece; provide overridden declaration to receive "bool special" param to toggle showing control tokens

* feat: use the overridden declaration of llama_token_to_piece from common/common.cpp to specify "false" so that control tokens are not shown in chat completion responses

* common : simplify

---------

Co-authored-by: Georgi Gerganov
---
 common/common.cpp          | 6 +++---
 common/common.h            | 5 +++--
 examples/server/server.cpp | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index a0d1f8d59..97f55b053 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,12 +2328,12 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index cca44268e..157b54a3e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -237,11 +237,12 @@ std::vector<llama_token> llama_tokenize(
                         bool   add_special,
                         bool   parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
-        llama_token token);
+        llama_token token,
+        bool        special = true);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 68c63f9f1..3acbd17df 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1117,7 +1117,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it
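
Note (not part of the patch): a minimal sketch of how a caller might use the
overridden common helper after this change, assuming a llama_context that has
already been created through the usual llama.cpp model loading path (omitted
here); the render_tokens name is hypothetical.

    // Render sampled tokens with or without special/control tokens, using the
    // common helper declared in common/common.h.
    #include "common.h"

    #include <string>
    #include <vector>

    static std::string render_tokens(const llama_context * ctx, const std::vector<llama_token> & tokens, bool show_special) {
        std::string out;
        for (const llama_token tok : tokens) {
            // `special` defaults to true; the server now passes false so that
            // control tokens (e.g. <|eot_id|>) are not rendered into chat completion responses.
            out += llama_token_to_piece(ctx, tok, show_special);
        }
        return out;
    }

Passing false mirrors what the server now does in process_token, while the
default of true preserves the previous behavior for callers that do want
control tokens rendered.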