Using llama_tokenize() in tests

jaime-m-p 2024-06-20 18:20:16 +02:00
parent eea8dfab6b
commit d779bab49c
2 changed files with 25 additions and 49 deletions


@@ -2906,51 +2906,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return std::string(result.data(), result.size());
+    return piece;
 }
 
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
-    }
-
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
+    text.resize(n_chars);
+
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
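
Both new wrappers follow the same two-pass pattern: call the C API once with whatever storage the std::string already has; if the text does not fit, the call reports the required size as a negative number, so the string is resized and the call repeated, and finally the string is trimmed to the bytes actually written. A minimal standalone sketch of that pattern follows; fill_buffer() is a hypothetical stand-in for the underlying C API call, not part of llama.cpp.

#include <cassert>
#include <cstring>
#include <string>

// Hypothetical stand-in for a llama.cpp C API call: writes up to `size` bytes
// into `buf` and returns the number written, or the negative of the required
// size when the buffer is too small (the convention used by
// llama_token_to_piece() and llama_detokenize()).
static int fill_buffer(char * buf, int size) {
    static const char msg[] = "hello detokenized world";
    const int needed = (int) std::strlen(msg);
    if (size < needed) {
        return -needed;
    }
    std::memcpy(buf, msg, needed);
    return needed;
}

// Two-pass pattern used by both wrappers: try with the string's existing
// capacity first, reallocate only if the first call reports a larger size.
std::string two_pass_call() {
    std::string out;
    out.resize(out.capacity());  // reuse whatever storage the string starts with
    int n = fill_buffer(&out[0], (int) out.size());
    if (n < 0) {
        out.resize(-n);          // grow to the size reported by the first call
        n = fill_buffer(&out[0], (int) out.size());
        assert(n >= 0);
    }
    out.resize(n);               // trim to the bytes actually written
    return out;
}

The real wrappers use GGML_ASSERT instead of assert and pass the model handle and the `special` flag through to the C API.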


@@ -336,21 +336,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-//       that takes into account the tokenizer type and decides how to handle the leading space
-//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
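
With the single llama_detokenize() declared above, a test can round-trip text through the tokenizer without choosing between the old SPM and BPE variants. A rough sketch of such a check, assuming common.h is included, `ctx` was created from a loaded model, and the common llama_tokenize(ctx, text, add_special, parse_special) overload keeps the signature it has at this revision:

#include <string>
#include <vector>

#include "common.h"  // llama_tokenize(), llama_detokenize() declared above

// Sketch of a round-trip check a test might perform (assumes `ctx` points to
// an initialized llama_context created from a loaded model).
static bool roundtrip_ok(llama_context * ctx, const std::string & input) {
    // tokenize through the common helper, parsing special/control tokens
    std::vector<llama_token> tokens = llama_tokenize(ctx, input, /*add_special*/ false, /*parse_special*/ true);
    // detokenize back through the single tokenizer-agnostic helper
    std::string detok = llama_detokenize(ctx, tokens, /*special*/ true);
    return detok == input;
}

Depending on the tokenizer, leading-whitespace handling can make strict equality too strict; the point of the single helper is that the test no longer has to pick between the SPM and BPE code paths.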